library("ggplot2")
library("ggpubr")
library(plotly)
library(webshot)
#***********************************************************************************************************
## Correlation between IRS and the fraction of intergenic and exonic IESs ("nuclear prevalences")_Fig5A
#***********************************************************************************************************
# Load the IES table (Supplemental Table S2). The path below is a placeholder
# and has to be replaced with the actual location of the file.
# Arguments are spelled out in full (no partial matching, no T/F shorthands).
IES_tab <- read.table('  path to IES table  "Supplemental Table S2" ',
                      header = TRUE, stringsAsFactors = FALSE)

# Printed check: among IESs with IRS_25F0 > 0.1, the fraction annotated as
# INTERGENIC. Rows where the comparison is NA are left out of the respective
# count (na.rm = TRUE), which matches counting the non-NA subset.
irs_25F0_above <- IES_tab$IRS_25F0 > 0.1
sum(irs_25F0_above & IES_tab$IES_LOCATION == 'INTERGENIC', na.rm = TRUE) /
  sum(irs_25F0_above, na.rm = TRUE)

#*****************************************************************
# Intergenic and exonic fraction ~ IRS
#*****************************************************************
# IRS columns of IES_tab (one per sample) and the IRS thresholds to scan.
Temp <- c("IRS_18F1", "IRS_25F0", "IRS_25F1", "IRS_32F1")
S <- seq(0, 0.95, 0.05)

# For every column in `irs_columns` and every value in `thresholds`, compute
# the fraction of IESs with IRS > threshold whose IES_LOCATION equals
# `location`.
#
# ies_tab      data frame holding the IRS columns and an IES_LOCATION column
# location     value of IES_LOCATION to count (e.g. "INTERGENIC", "EXON")
# irs_columns  character vector of IRS column names
# thresholds   numeric vector of IRS cut-offs
#
# Returns a data frame with numeric columns IRS (the threshold) and
# INT_Fraction (the fraction; NaN when no IES exceeds the threshold) and a
# character column Sample (the IRS column name). Rows are ordered column by
# column, thresholds in the given order within each column. Comparisons that
# evaluate to NA are excluded from the respective count.
#
# The columns are built with their final types directly; filling rows with
# c(number, number, string) would coerce the numbers to character first.
fraction_above_irs <- function(ies_tab, location, irs_columns, thresholds) {
  in_location <- ies_tab$IES_LOCATION == location
  per_sample <- lapply(irs_columns, function(irs_column) {
    irs <- ies_tab[[irs_column]]
    fractions <- vapply(thresholds, function(threshold) {
      above <- irs > threshold
      sum(above & in_location, na.rm = TRUE) / sum(above, na.rm = TRUE)
    }, numeric(1))
    data.frame(IRS = thresholds,
               INT_Fraction = fractions,
               Sample = rep(irs_column, length(thresholds)),
               stringsAsFactors = FALSE)
  })
  do.call(rbind, per_sample)
}

# Intergenic fraction per sample and threshold.
IntergenicDF <- fraction_above_irs(IES_tab, "INTERGENIC", Temp, S)
Intergenic <- IntergenicDF

# Exonic fraction per sample and threshold. The value column keeps the name
# INT_Fraction so both tables can be stacked and plotted with one mapping.
ExonicDF <- fraction_above_irs(IES_tab, "EXON", Temp, S)
Exonic <- ExonicDF

# Stack the intergenic and exonic tables for the point layer; the two
# smoothers below are fitted on the per-compartment tables separately.
test <- rbind(Intergenic, Exonic)

ggplot(data = test,
       aes(x = IRS,
           y = INT_Fraction,
           color = Sample)) +
  xlab("IRS") +
  ylab("Somatic IESs (fraction)") +
  xlim(0, 1) + ylim(0, 1) +
  theme_classic() +
  theme(axis.text = element_text(size = 25),
        axis.title.y = element_text(size = 30, hjust = 0.5, vjust = +10),
        axis.title.x = element_text(size = 30, vjust = -100),
        legend.position = "top",
        legend.text = element_text(size = 20),
        plot.margin = unit(c(1, 1, 1.5, 1.2), "cm")) +
  scale_color_manual(labels = c("18°C (F1)", "25°C (F0)", "25°C (F1)", "32°C (F1)"),
                     values = c("dark green", "dark blue", "sky blue", "firebrick")) +
  # annotate() draws each label exactly once; a geom_label() layer with
  # constant x/y/label is drawn once per row of `test` and overplots itself.
  annotate("label", x = 0.6, y = 0.8, label = "Intergenic", colour = "black", size = 8) +
  annotate("label", x = 0.6, y = 0.1, label = "Exonic", colour = "black", size = 8) +
  geom_smooth(data = Intergenic, aes(x = IRS, y = INT_Fraction),
              method = "auto", span = 0.4, se = TRUE, fill = "#DCDCDC") +
  geom_smooth(data = Exonic, aes(x = IRS, y = INT_Fraction),
              method = "auto", span = 0.4, se = TRUE, fill = "#DCDCDC") +
  geom_point(size = 2, alpha = 1) +
  guides(color = guide_legend(title = NULL))

#*******************************************************************************************************************************************
## Fig5B_v6_BoxPlot comparing the expression levels of genes affected by non-trivial retention (IRS > 0.1) with those of the full MAC set
## RNAseq data from PTET strain d12_Fig5B
#*******************************************************************************************************************************************

# Build data frame
# IRS > 0.1
# Rows used for the "somatic IES" set: first occurrence of each non-NA IES_ID,
# restricted to IES_LOCATION == "EXON" and IRS_25F1 > 0.1.
# which() keeps only rows where the condition is TRUE. Indexing with the raw
# logical mask would also return an NA entry for every row where the condition
# is NA (e.g. missing IRS_25F1), adding all-NA rows to the set.
# NOTE(review): de-duplication is by IES_ID, not GENE_ID, so if one gene hosts
# several such IESs its expression value enters the set once per IES, whereas
# the full set below is de-duplicated by GENE_ID -- confirm this is intended.
somatic_rows <- which(!is.na(IES_tab$IES_ID) &
                        !duplicated(IES_tab$IES_ID) &
                        IES_tab$IRS_25F1 > 0.1 &
                        IES_tab$IES_LOCATION == "EXON")

genes_df25_R1 <- IES_tab$TPM_25_R1_D12[somatic_rows]
genes_df25_R2 <- IES_tab$TPM_25_R2_D12[somatic_rows]
genes_df25_R3 <- IES_tab$TPM_25_R3_D12[somatic_rows]

# Mean TPM over the three replicate columns; a row with an NA replicate
# yields NA (no na.rm), as with mean() applied row by row.
genes_df25_R1_R2_R3 <- cbind(genes_df25_R1, genes_df25_R2, genes_df25_R3)
genes_df25 <- rowMeans(genes_df25_R1_R2_R3)

genes_df25 <- data.frame(VEG = genes_df25,
                         SET = rep('Genes with somatic IESs', length(genes_df25)),
                         stringsAsFactors = FALSE)
head(genes_df25)
summary(genes_df25$VEG)

# Full set of Paramecium macronuclear IES-containing genes
# One row per gene: first occurrence of each non-NA GENE_ID in IES_tab.
gene_rows <- which(!is.na(IES_tab$GENE_ID) & !duplicated(IES_tab$GENE_ID))

Expr_25_R1 <- IES_tab$TPM_25_R1_D12[gene_rows]
Expr_25_R2 <- IES_tab$TPM_25_R2_D12[gene_rows]
Expr_25_R3 <- IES_tab$TPM_25_R3_D12[gene_rows]

genes_FULL_combo <- cbind(Expr_25_R1,
                          Expr_25_R2,
                          Expr_25_R3)

genes_FULL <- rowMeans(genes_FULL_combo)
genes_FULL <- data.frame(VEG = genes_FULL,
                         SET = rep('All genes', length(genes_FULL)),
                         stringsAsFactors = FALSE)
head(genes_FULL)
summary(genes_FULL$VEG)

# Long table with both sets: VEG = mean TPM, SET = set label.
genes_df2 <- rbind(genes_df25, genes_FULL)

# Notched box plot of VEG (mean TPM) per SET. Outlier points are not drawn
# (outlier.shape = NA) and the y range passed to ggboxplot is 0-25.
set_labels <- c("Genes with somatic IESs", "All genes")
set_palette <- c("firebrick", "#00AFBB")

p <- ggboxplot(
  genes_df2,
  x = "SET",
  y = "VEG",
  fill = "SET",
  shape = "SET",
  palette = set_palette,
  width = 0.5,
  add = "none",
  notch = TRUE,
  out = FALSE,
  outlier.shape = NA,
  font.label = list(size = 20, face = "plain"),
  ylim = c(0, 25),
  xlab = "",
  ylab = "Expression level (TPM)"
)

# First printed version: test-based annotation (pairwise comparison of the two
# sets plus the overall test label placed at y = 20).
my_comparisons <- list(set_labels)
pairwise_layer <- stat_compare_means(comparisons = my_comparisons,
                                     label = "p.format",
                                     p.adjust.method = "BH")
overall_layer <- stat_compare_means(label.y = 20)
p + pairwise_layer + overall_layer

# Second printed version: hand-drawn bracket (two vertical ticks joined by a
# horizontal bar at y = 20) with a fixed "***" label, and no x axis
# text/title or legend.
bracket_layer <- annotate("segment",
                          x = c(1, 2, 1), xend = c(1, 2, 2),
                          y = c(19, 19, 20), yend = c(20, 20, 20),
                          size = 0.6)
stars_layer <- annotate("text", x = 1.5, y = 20.5, label = "***", size = 8)
figure_theme <- theme(
  plot.margin = unit(c(1, 1, 1.5, 1.2), "cm"),
  axis.text = element_text(size = 25),
  axis.title.y = element_text(size = 40, hjust = 0.5, vjust = +8),
  axis.title.x = element_blank(),
  axis.text.x = element_blank(),
  legend.position = "none"
)

p + bracket_layer + stars_layer + figure_theme

#*************************************************************************
## Display IES reference distribution (PGM-Ref) with a pie chart_Fig5C
#*************************************************************************

# One entry per IES: first occurrence of each non-NA IES_ID.
unique_ies <- !is.na(IES_tab$IES_ID) & !duplicated(IES_tab$IES_ID)
n_unique_ies <- sum(unique_ies)

# Fraction of unique IESs whose IES_LOCATION equals `location`.
# sum(..., na.rm = TRUE) counts only rows where the condition is TRUE; taking
# length() of a logically indexed vector would also count rows whose
# IES_LOCATION is NA, in every compartment.
pgmref_fraction <- function(location) {
  sum(unique_ies & IES_tab$IES_LOCATION == location, na.rm = TRUE) / n_unique_ies
}

Exon_PGMref <- pgmref_fraction('EXON')
Intergenic_PGMref <- pgmref_fraction('INTERGENIC')
Intron_PGMref <- pgmref_fraction('INTRON')
ncRNAs_PGMref <- pgmref_fraction('NC')

# rbind() names the rows after the variables; those row names become the
# Compartment labels shown in the pie chart.
frame <- as.data.frame(rbind(Exon_PGMref, Intergenic_PGMref, Intron_PGMref, ncRNAs_PGMref))
colnames(frame) <- "PGMref"
frame$Compartment <- rownames(frame)
data <- frame[, c('Compartment', 'PGMref')]
# One colour per compartment, in row order.
colors <- c('rgb(114,147,203)', 'rgb(128,133,133)', 'rgb(144,103,167)', 'rgb(211,94,96)')

p <- plot_ly(data, labels = ~Compartment, values = ~PGMref, type = 'pie',
             textposition = 'inside',
             textinfo = 'label+percent',
             insidetextfont = list(color = 'black', size = 15),
             outsidetextfont = list(color = 'black', size = 50),
             hoverinfo = 'text',
             text = ~paste('Compartment:', Compartment),
             marker = list(colors = colors,
                           line = list(color = '#FFFFFF', width = 1.5)),
             #The 'pull' attribute can also be used to create space between the sectors
             showlegend = FALSE) %>%
  layout(title = 'Distribution of PGM-ref IESs across Genomic Compartments',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         autosize = TRUE)
p

#***********************************************************************************
## Display genomic distribution of significantly retained IESs with a pie chart_Fig5C
# Run the script "PDE efficiency across temperature" before executing this script
# (SIGN18_dist, SIGN25_dist and SIGN32_dist are not created in this file).
#***********************************************************************************

# Number of rows of `dist_tab` flagged SIGNIFICANT == 'TRUE' whose
# IES_LOCATION equals `location`. Rows where either comparison is NA are not
# counted (na.rm = TRUE); length() of a logically indexed vector would count
# them as well.
count_significant <- function(dist_tab, location) {
  sum(dist_tab$SIGNIFICANT == 'TRUE' & dist_tab$IES_LOCATION == location,
      na.rm = TRUE)
}

NC32F1 <- count_significant(SIGN32_dist, 'NC')
EXON32F1 <- count_significant(SIGN32_dist, 'EXON')
INTER32F1 <- count_significant(SIGN32_dist, 'INTERGENIC')
INTRON32F1 <- count_significant(SIGN32_dist, 'INTRON')

NC18F1 <- count_significant(SIGN18_dist, 'NC')
EXON18F1 <- count_significant(SIGN18_dist, 'EXON')
INTER18F1 <- count_significant(SIGN18_dist, 'INTERGENIC')
INTRON18F1 <- count_significant(SIGN18_dist, 'INTRON')

NC25F1 <- count_significant(SIGN25_dist, 'NC')
EXON25F1 <- count_significant(SIGN25_dist, 'EXON')
INTER25F1 <- count_significant(SIGN25_dist, 'INTERGENIC')
INTRON25F1 <- count_significant(SIGN25_dist, 'INTRON')

# Printed counts per compartment, in the order 25F1, 18F1, 32F1.
c(NC25F1, NC18F1, NC32F1)
c(EXON25F1, EXON18F1, EXON32F1)
c(INTER25F1, INTER18F1, INTER32F1)
c(INTRON25F1, INTRON18F1, INTRON32F1)

# Per-sample totals over the four compartments (order: 25F1, 18F1, 32F1).
totals <- c(EXON25F1 + INTER25F1 + INTRON25F1 + NC25F1,
            EXON18F1 + INTER18F1 + INTRON18F1 + NC18F1,
            EXON32F1 + INTER32F1 + INTRON32F1 + NC32F1)

# Compartment fractions per sample.
# NOTE: `Intergenic` is rebound here; earlier in this file it holds the Fig5A
# data frame, so re-run that section before re-drawing Fig5A.
Exon <- c(EXON25F1, EXON18F1, EXON32F1) / totals
Intergenic <- c(INTER25F1, INTER18F1, INTER32F1) / totals
Intron <- c(INTRON25F1, INTRON18F1, INTRON32F1) / totals
ncRNAs <- c(NC25F1, NC18F1, NC32F1) / totals

# Rows are named after the variables and become the pie labels.
frame <- as.data.frame(rbind(Exon, Intergenic, Intron, ncRNAs))
colnames(frame) <- c("F125", "F118", "F132")
frame$Compartment <- rownames(frame)

# Switch between datasets ("F125", "F118", "F132") to display the genomic distribution of significantly retained IESs
# `dataset` is the only place to edit; it selects both the column kept in
# `data` and the values plotted.
dataset <- "F125"
stopifnot(dataset %in% c("F125", "F118", "F132"))
data <- frame[, c('Compartment', dataset)]

# One colour per compartment, in row order.
colors <- c('rgb(114,147,203)', 'rgb(128,133,133)', 'rgb(144,103,167)', 'rgb(211,94,96)')

p <- plot_ly(data, labels = ~Compartment, values = as.formula(paste0("~", dataset)), type = 'pie',
             textposition = 'inside',
             textinfo = 'label+percent',
             outsidetextfont = list(color = 'black', size = 50),
             hoverinfo = 'text',
             text = ~paste('Compartment:', Compartment),
             marker = list(colors = colors,
                           line = list(color = '#FFFFFF', width = 1.5)),
             #The 'pull' attribute can also be used to create space between the sectors
             showlegend = FALSE) %>%
  layout(title = '',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         autosize = TRUE, titlefont = list(color = 'black', size = 15))
p
