#**************************************************************************************************
#******************************** PTC Distance from CDS_START *************************************
#**************************************************************************************************
library(binom)
library(plotly)
library(webshot)
library(magrittr)
library(e1071)
library(Hmisc)
# Load the IES table (Supplemental Table S2); the first line holds the column names
IES_tab <- read.table(' path to IES table "Supplemental Table S2" ', header = TRUE)

# Reference retention level per IES: upper bound of the 75% "exact" binomial
# confidence interval of IES+ reads over total (IES+ plus IES-) reads in 25F0
REF_upper <- local({
  plus_reads <- na.omit(IES_tab$IES_PLUS_25F0)
  total_reads <- plus_reads + na.omit(IES_tab$IES_MINUS_25F0)
  interval <- binom.confint(
    x = plus_reads,
    n = total_reads,
    methods = "exact",
    conf.level = 0.75
  )
  interval$upper
})


# Assemble the per-IES working table; the adjusted p-values and the
# significance flag are appended to it further down.
# NOTE: cbind() on a mix of character and numeric vectors produces a character
# matrix, so every column of RF holds text (or a factor, depending on the R
# version) and is converted back with as.numeric(as.character(...)) wherever
# numbers are needed.
RF <- local({
  has_id <- !is.na(IES_tab$IES_ID)
  plus_25F0 <- na.omit(IES_tab$IES_PLUS_25F0)
  plus_32F1 <- na.omit(IES_tab$IES_PLUS_32F1)

  as.data.frame(cbind(
    IES_ID = as.character(na.omit(IES_tab$IES_ID)),
    IRS_25F0 = as.character(na.omit(IES_tab$IRS_25F0)),
    REF_upper = REF_upper,
    IRS_32F1 = na.omit(IES_tab$IRS_32F1),
    IES_PLUS_25F0 = plus_25F0,
    IES_PLUS_32F1 = plus_32F1,
    Total_25F0 = plus_25F0 + na.omit(IES_tab$IES_MINUS_25F0),
    Total32F1 = plus_32F1 + na.omit(IES_tab$IES_MINUS_32F1),
    IES_LOCATION = as.character(na.omit(IES_tab$IES_LOCATION)),
    IRS_25F1 = as.numeric(as.character(na.omit(IES_tab$IRS_25F1))),
    IRS_18F1 = as.numeric(as.character(na.omit(IES_tab$IRS_18F1))),
    exprM = as.numeric(as.character(IES_tab$exprM[has_id])),
    GENE_ID = as.character(IES_tab$GENE_ID[has_id]),
    IES_LEN = as.numeric(as.character(IES_tab$IES_LEN[has_id])),
    PTC_IS_DOWNSTREAM = as.character(IES_tab$PTC_IS_DOWNSTREAM[has_id]),
    PTC_INDUCED = as.character(IES_tab$PTC_INDUCED[has_id]),
    PTC_INDUCED_DIST_TO_CDS_START = IES_tab$PTC_INDUCED_DIST_TO_CDS_START[has_id],
    GENE_LEN = IES_tab$GENE_LEN[has_id],
    GENE_ORIENTATION = as.character(IES_tab$GENE_ORIENTATION[has_id]),
    GENE_START = IES_tab$GENE_START[has_id],
    GENE_END = IES_tab$GENE_END[has_id],
    IES_POINT = IES_tab$IES_POINT[has_id],
    CDS_START = IES_tab$CDS_START[has_id],
    CDS_END = IES_tab$CDS_END[has_id],
    DIST_TO_CDS_START = IES_tab$DIST_TO_CDS_START[has_id]
  ))
})


# Input of the binomial test, one row per IES:
#   column 1 = reference probability (upper bound of the 75% C.I., REF_upper)
#   column 2 = IES+ reads of the sample to be tested (32F1)
#   column 3 = total reads (IES+ plus IES-) of that sample
binomtable <- cbind(
  REF_upper,
  na.omit(IES_tab$IES_PLUS_32F1),
  na.omit(IES_tab$IES_PLUS_32F1) + na.omit(IES_tab$IES_MINUS_32F1)
)
#colnames(binomtable) <- c("Expected", "Success", "Trials")

# Performs a one-sided ("greater") exact binomial test on one row of the
# binomial table, where appropriate.
#
# tab: vector whose first three elements are
#        [1] the expected success probability,
#        [2] the number of successes,
#        [3] the number of trials.
# Returns the p-value of binom.test(), or NA when the test cannot be run:
# a missing probability or count, or no successes and no trials.
get_Freq_Test_Pval <- function(tab) {
  expected <- tab[1]
  successes <- tab[2]
  trials <- tab[3]

  # Scalar checks: `||`/`&&` short-circuit, and testing for NA first keeps
  # if() from ever seeing an NA condition.
  if (is.na(expected) || is.na(successes) || is.na(trials)) {
    return(NA)
  }
  if (successes == 0 && trials == 0) {
    return(NA)
  }

  binom.test(
    x = successes,
    n = trials,
    p = expected,
    alternative = "greater"
  )$p.value
}

# Run the test on every row of the binomial table, then adjust the p-values
# for multiple testing (Benjamini-Hochberg)
pvalues <- apply(binomtable, 1, get_Freq_Test_Pval)
RF$padj <- p.adjust(pvalues, method = "BH")

# An IES is flagged as significant when its adjusted p-value is below 0.05 and
# both samples (25F0 and 32F1) have more than 20 reads in total
RF$SIGNIFICANT <- (
  !is.na(RF$padj) &
    RF$padj < 0.05 &
    (na.omit(IES_tab$IES_PLUS_25F0) + na.omit(IES_tab$IES_MINUS_25F0)) > 20 &
    (na.omit(IES_tab$IES_PLUS_32F1) + na.omit(IES_tab$IES_MINUS_32F1)) > 20
)

# Number of significant IESs
sum(RF$SIGNIFICANT)
#*********************************************************************************
## ********************************* Select set **********************************
#*********************************************************************************
# SET1: exonic IESs that are significantly retained at 32°C
SET <- RF[RF$IES_LOCATION == "EXON" & RF$SIGNIFICANT, ]
# SET2_IRS > desired threshold
#SET <- RF[RF$IES_LOCATION=="EXON" & as.numeric(as.character(RF$IRS_32F1)) > 0.1,]
# SET3_full set
#SET <- IES_tab[!is.na(IES_tab$IES_ID) &
#                     !duplicated(IES_tab$IES_ID) & 
 #                    IES_tab$IES_LOCATION=="EXON",]

# Derived positional columns. The columns inherited from RF hold text (or
# factors), hence the as.numeric(as.character(...)) conversions.

# IES position relative to the gene start, as a percentage of the gene length
SET$DIST_TSS_perc <- (as.numeric(as.character(SET$IES_POINT)) -
  as.numeric(as.character(SET$GENE_START))) /
  as.numeric(as.character(SET$GENE_LEN)) * 100

# CDS length, without and with the retained IES
SET$CDS_LEN <- abs(as.numeric(as.character(SET$CDS_END)) -
  as.numeric(as.character(SET$CDS_START)))
SET$CDS_LEN_plus_IES <- SET$CDS_LEN + as.numeric(as.character(SET$IES_LEN))

# Distance of the induced PTC from the CDS start, as a percentage of the
# (CDS + IES) length
SET$PTC_INDUCED_CDS_START_perc <-
  as.numeric(as.character(SET$PTC_INDUCED_DIST_TO_CDS_START)) /
  SET$CDS_LEN_plus_IES * 100

# IES position relative to the CDS start, as a percentage of the CDS length.
# The sign of the offset depends on the gene orientation:
# IES_POINT - CDS_START for "+" genes, CDS_START - IES_POINT otherwise.
# Computed in one vectorised step so that the values stay numeric (no round
# trip through a character column), an empty SET yields an empty column
# instead of failing, and a missing orientation gives NA rather than an error.
SET$DIST_CDS_START_perc <- local({
  ies_point <- as.numeric(as.character(SET$IES_POINT))
  cds_start <- as.numeric(as.character(SET$CDS_START))
  offset <- ifelse(
    as.character(SET$GENE_ORIENTATION) == "+",
    ies_point - cds_start,
    cds_start - ies_point
  )
  offset / SET$CDS_LEN * 100
})

# 18 UTR-IESs are excluded as they lie outside the CDS
SET <- SET[SET$DIST_CDS_START_perc > 0 & SET$DIST_CDS_START_perc < 100, ]

#*************************************************************************************************
# Function = Distribution of PTC and non-PTC inducing IESs within the CDS
# Purpose = Assess the phenotypic consequences of IES retention based on location
# of IES/PTC within the CDS (R2_7)
# Plot = line plot
#*************************************************************************************************

#*********************************************************************************
## PTC distance from CDS start (expressed in %, != IES distance from CDS start)
#*********************************************************************************
# PTC position (% of CDS + IES length) for every PTC-inducing IES
PTC <- with(
  SET,
  PTC_INDUCED_CDS_START_perc[!is.na(PTC_INDUCED) & PTC_INDUCED == "True"]
)

# Skewness is not particularly informative here, as the distribution becomes
# bimodal with increasing IRS
skewness(PTC)

# Histogram
hist(PTC,
     breaks = 20,
     col = "dark blue",
     border = FALSE,
     add = FALSE,
     freq = TRUE)

# Kernel density estimate
d_PTC <- density(PTC)
plot(d_PTC, col = "dark blue", main = "", type = "l", ylim = c(0, 0.015))

# IRS (32F1) of the same IESs, plotted against the PTC position within the
# CDS (expressed as %)
PTC_IRS <- with(
  SET,
  as.numeric(as.character(
    IRS_32F1[!is.na(PTC_INDUCED) & PTC_INDUCED == "True"]
  ))
)

plot(PTC, PTC_IRS, pch = 19, type = "h")
minor.tick(nx = 10, ny = 0.1)

#*********************************************************************************
## All_PTC-Inducing IESs (distance from CDS start)
#*********************************************************************************
# IES position (% of CDS length) for every PTC-inducing IES
PTC_IES <- with(
  SET,
  na.omit(DIST_CDS_START_perc[!is.na(PTC_INDUCED) & PTC_INDUCED == "True"])
)

skewness(PTC_IES)

# Histogram
hist(PTC_IES,
     breaks = 200,
     col = "dark red",
     border = FALSE,
     add = FALSE,
     freq = TRUE)

# Kernel density estimate
d_PTC_IES <- density(PTC_IES)
plot(d_PTC_IES, col = "dark red", main = "", type = "l")

# IRS (32F1) of the same IESs, plotted against the IES position within the
# CDS (expressed as %)
PTC_IES_IRS <- with(
  SET,
  as.numeric(as.character(
    IRS_32F1[!is.na(PTC_INDUCED) & PTC_INDUCED == "True"]
  ))
)

plot(PTC_IES, PTC_IES_IRS, pch = 19, type = "h")
minor.tick(nx = 10, ny = 1)

#*********************************************************************************
# 3n-IES PTC-inducing
#*********************************************************************************
# IES position (% of CDS length) for PTC-inducing IESs whose length is a
# multiple of 3 and whose PTC is not flagged as downstream
PTC_3n_IES <- with(
  SET,
  na.omit(DIST_CDS_START_perc[
    !is.na(PTC_INDUCED) &
      PTC_INDUCED == "True" &
      PTC_IS_DOWNSTREAM == "False" &
      as.numeric(as.character(IES_LEN)) %% 3 == 0
  ])
)

skewness(PTC_3n_IES)

# Histogram
hist(PTC_3n_IES,
     breaks = 20,
     col = "dark green",
     border = FALSE,
     add = FALSE,
     freq = TRUE)

# Kernel density estimate
d_PTC_3n_IES <- density(PTC_3n_IES)
plot(d_PTC_3n_IES, col = "dark green", main = "", type = "l")

# IRS (32F1) of the same IESs, plotted against the IES position within the
# CDS (expressed as %)
PTC_3n_IES_IRS <- with(
  SET,
  as.numeric(as.character(IRS_32F1[
    !is.na(PTC_INDUCED) &
      PTC_INDUCED == "True" &
      PTC_IS_DOWNSTREAM == "False" &
      as.numeric(as.character(IES_LEN)) %% 3 == 0
  ]))
)

plot(PTC_3n_IES, PTC_3n_IES_IRS, pch = 19, type = "h")

#*********************************************************************************
# 3n-IES
#*********************************************************************************
# IES position (% of CDS length) for all IESs whose length is a multiple of 3
# (PTC status known)
IES_3n <- with(
  SET,
  na.omit(DIST_CDS_START_perc[
    !is.na(PTC_INDUCED) & as.numeric(as.character(IES_LEN)) %% 3 == 0
  ])
)

skewness(IES_3n)

# Histogram
hist(IES_3n,
     breaks = 20,
     col = "dark orange",
     border = FALSE,
     add = FALSE,
     freq = TRUE)

# Kernel density estimate
d_IES_3n <- density(IES_3n)
plot(d_IES_3n, col = "dark orange", main = "", type = "l")

# IRS (32F1) of the same IESs, plotted against the IES position within the
# CDS (expressed as %)
IES_3n_IRS <- with(
  SET,
  as.numeric(as.character(IRS_32F1[
    !is.na(PTC_INDUCED) & as.numeric(as.character(IES_LEN)) %% 3 == 0
  ]))
)

plot(IES_3n, IES_3n_IRS, pch = 19, type = "h")

#*********************************************************************************
# 3n-IES, non-PTC-inducing
#*********************************************************************************
# IES position (% of CDS length) for IESs whose length is a multiple of 3 and
# that do not induce a PTC
non_PTC_3n_IES <- with(
  SET,
  na.omit(DIST_CDS_START_perc[
    !is.na(PTC_INDUCED) &
      PTC_INDUCED == "False" &
      as.numeric(as.character(IES_LEN)) %% 3 == 0
  ])
)

# Kernel density estimate
d_non_PTC_3n_IES <- density(non_PTC_3n_IES)
plot(d_non_PTC_3n_IES, col = "purple", main = "", type = "l")

#*********************************************************************************
# Frameshift PTC downstream to the insertion (FS-PTC)
#*********************************************************************************
# IES position (% of CDS length) for PTC-inducing IESs whose PTC is flagged as
# downstream of the insertion
FS_PTC_IES <- with(
  SET,
  na.omit(DIST_CDS_START_perc[
    !is.na(PTC_INDUCED) &
      PTC_INDUCED == "True" &
      PTC_IS_DOWNSTREAM == "True"
  ])
)

# Kernel density estimate
d_FS_PTC_IES <- density(FS_PTC_IES)
plot(d_FS_PTC_IES, col = "black", main = "", type = "l")

#*********************************************************************************
# Tail-FS IES
#*********************************************************************************
# IES position (% of CDS length) for IESs whose length is not a multiple of 3
# and that do not induce a PTC
Tail_FS_IES <- with(
  SET,
  na.omit(DIST_CDS_START_perc[
    !is.na(PTC_INDUCED) &
      PTC_INDUCED == "False" &
      as.numeric(as.character(IES_LEN)) %% 3 != 0
  ])
)

# Kernel density estimate
d_Tail_FS_IES <- density(Tail_FS_IES)
plot(d_Tail_FS_IES, col = "gold", main = "", type = "l")


#***********************************************************************************************
# Function = Distribution of PTC and non-PTC inducing IESs in CDS
# Purpose = Assess the protein diversification potential of IES retention in CDS
# Plot = Pie chart
#***********************************************************************************************

# Fraction of IES-induced PTCs in CDS (among IESs with a known PTC status),
# followed by the IRS (32F1) summary of the PTC-inducing IESs
with(SET, {
  known_status <- PTC_INDUCED[!is.na(PTC_INDUCED)]
  length(which(as.logical(known_status))) / length(known_status)
})
summary(with(
  SET,
  as.numeric(as.character(na.omit(
    IRS_32F1[!is.na(PTC_INDUCED) & PTC_INDUCED == "True"]
  )))
))

# Broken down into a) PTC within IES (PTC-IES) and b) frameshift PTC in the
# downstream CDS (FS-IES); both are fractions of the IESs with a known status.
# NOTE: PTC_IES is re-used here as a scalar fraction and no longer holds the
# vector of IES positions.
PTC_IES <- with(
  SET,
  sum(
    !is.na(PTC_INDUCED) & PTC_INDUCED == "True" & PTC_IS_DOWNSTREAM == "False",
    na.rm = TRUE
  ) / sum(!is.na(PTC_INDUCED))
)
summary(with(
  SET,
  as.numeric(as.character(na.omit(IRS_32F1[
    !is.na(PTC_INDUCED) & PTC_INDUCED == "True" & PTC_IS_DOWNSTREAM == "False"
  ])))
))

FS_IES <- with(
  SET,
  sum(
    !is.na(PTC_INDUCED) & PTC_INDUCED == "True" & PTC_IS_DOWNSTREAM == "True",
    na.rm = TRUE
  ) / sum(!is.na(PTC_INDUCED))
)
summary(with(
  SET,
  as.numeric(as.character(na.omit(IRS_32F1[
    !is.na(PTC_INDUCED) & PTC_INDUCED == "True" & PTC_IS_DOWNSTREAM == "True"
  ])))
))

# Fraction of non-PTC-inducing IESs in CDS (among IESs with a known PTC
# status), followed by the IRS (32F1) summary of those IESs
with(
  SET,
  sum(!is.na(PTC_INDUCED) & PTC_INDUCED == "False", na.rm = TRUE) /
    sum(!is.na(PTC_INDUCED))
)
summary(with(
  SET,
  as.numeric(as.character(na.omit(
    IRS_32F1[!is.na(PTC_INDUCED) & PTC_INDUCED == "False"]
  )))
))

# Broken down into a) 3n-IES (Pro-IESs) and b) tail-FS
Pro_IES <- with(
  SET,
  sum(
    !is.na(PTC_INDUCED) &
      PTC_INDUCED == "False" &
      as.numeric(as.character(IES_LEN)) %% 3 == 0,
    na.rm = TRUE
  ) / sum(!is.na(PTC_INDUCED))
)
summary(with(
  SET,
  as.numeric(as.character(na.omit(IRS_32F1[
    !is.na(PTC_INDUCED) &
      PTC_INDUCED == "False" &
      as.numeric(as.character(IES_LEN)) %% 3 == 0
  ])))
))

Tail_IES <- with(
  SET,
  sum(
    !is.na(PTC_INDUCED) &
      PTC_INDUCED == "False" &
      as.numeric(as.character(IES_LEN)) %% 3 != 0,
    na.rm = TRUE
  ) / sum(!is.na(PTC_INDUCED))
)
summary(with(
  SET,
  as.numeric(as.character(na.omit(IRS_32F1[
    !is.na(PTC_INDUCED) &
      PTC_INDUCED == "False" &
      as.numeric(as.character(IES_LEN)) %% 3 != 0
  ])))
))

## Display reference distribution (PGM-Ref) in a pie chart
# One row per IES class, holding the fraction computed above
frame <- as.data.frame(rbind(PTC_IES, FS_IES, Tail_IES, Pro_IES))
names(frame) <- "CDS_IES"
frame$Compartment <- c("PTC-IES", "FS-IES", "TAIL-IES", "Pro-IES")

# Label column first, value column second
data <- frame[, c("Compartment", "CDS_IES")]
colors <- c("#CD5C5C", "#FF7F50", "#DC143C", "#3CB371")

p <- plot_ly(
  data,
  sort = TRUE,
  labels = ~Compartment,
  values = ~CDS_IES,
  type = "pie",
  textposition = "inside",
  textinfo = "label+percent",
  insidetextfont = list(color = "black", size = 15),
  hoverinfo = "text",
  text = ~paste("Compartment:", Compartment),
  marker = list(
    colors = colors,
    line = list(color = "#FFFFFF", width = 1.5)
  ),
  # The 'pull' attribute can also be used to create space between the sectors
  showlegend = FALSE
) %>%
  layout(
    title = "Classification of the outcome of IES insertions in CDS",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    autosize = TRUE
  )
p

