# ---------------------------------------------------------------------------------------
#
# Title   : Intergenic ORFs as elementary structural modules of de novo gene birth and protein evolution
# Authors : Chris Papadopoulos, Isabelle Callebaut, Jean-Christophe Gelly, Isabelle Hatin, Olivier Namy, Maxime Renard, Olivier Lespinet, Anne Lopes
# Date    : 14/09/2021
# Script  : Data Analysis & Figures generation
#
# ---------------------------------------------------------------------------------------

# ------------------- #
#  I M P O R T A N T  #
# ------------------- # --------------------------------------------------------------- #
# "files.path" must be edited before running the script: it is the root
# directory that is prepended, by plain string concatenation, to the location
# of every data table that is read and every figure that is written below.
# Keep the trailing "/" because no separator is inserted when paths are built.
files.path <- '/Users/christospapadopoulos/Desktop/Papadopoulos_Supplemental_Data/'
# ------------------------------------------------------------------------------------- #


# ================== #
#     FUNCTIONS      #
# ================== #
# Sizes of the clusters found in every sequence of `table`.
#
# Each sequence is split on "." and the number of characters of every
# non-empty piece is kept, giving one integer vector per sequence
# (integer(0) when the sequence contains no cluster at all).
#
# table    : vector (character or factor) of sequences.
# corr.seq : 'yes' to pass each sequence through correct_sequence() first;
#            any other value uses the sequences unchanged.
# Returns a list with one element per input sequence. An empty input yields an
# empty list (the former 1:length(table) loop iterated over c(1, 0) and failed).
calculate.cluster.size.list <- function(table, corr.seq){
  lapply(seq_along(table), function(i){
    hca <- as.character(table[i])
    if (corr.seq == 'yes'){
      hca <- correct_sequence(table[i])
    }
    pieces <- unlist(strsplit(as.character(hca), split = '\\.'))
    # consecutive dots produce empty pieces; they are not clusters
    nchar(pieces[pieces != ""])
  })
}

# Sizes of the internal linkers (runs of "." lying between two clusters) of
# every sequence of `table`.
#
# Each sequence is split on the characters of the class '[0|1|h|b]'
# (i.e. "0", "1", "h", "b" and, because it sits inside the brackets, a literal
# "|"). A leading run of dots and a trailing run of dots are discarded: they
# are extremities, handled by calculate.extremities.size.list().
#
# table    : vector (character or factor) of sequences.
# corr.seq : 'yes' to pass each sequence through correct_sequence() first.
# Returns a list with one integer vector per input sequence (integer(0) when
# there is no internal linker). An empty input yields an empty list (the
# former 1:length(table) loop iterated over c(1, 0) and failed).
calculate.linker.size.list <- function(table, corr.seq){
  lapply(seq_along(table), function(i){
    hca <- as.character(table[i])
    if (corr.seq == 'yes'){
      hca <- correct_sequence(table[i])
    }
    hca <- as.character(hca)
    gaps <- unlist(strsplit(hca, split = '[0|1|h|b]'))
    if (startsWith(x = hca, prefix = '.')){
      gaps <- gaps[-1]              # drop the leading extremity
    }
    if (endsWith(x = hca, suffix = '.')){
      gaps <- gaps[-length(gaps)]   # drop the trailing extremity
    }
    # adjacent cluster characters produce empty pieces; they are not linkers
    nchar(gaps[gaps != ""])
  })
}

# Sizes of the extremities (leading and/or trailing run of ".") of every
# sequence of `table`.
#
# table    : vector (character or factor) of sequences.
# corr.seq : 'yes' to pass each sequence through correct_sequence() first.
#
# Returns a list indexed like `table`:
#   * an integer vector holding the size of the leading run and/or of the
#     trailing run (integer(0) when the sequence neither starts nor ends
#     with ".");
#   * sequences made only of "." are skipped, so their slot stays NULL (and
#     the list is shorter than `table` when such sequences come last).
# An empty input yields an empty list (the former 1:length(table) loop
# iterated over c(1, 0) and failed).
calculate.extremities.size.list <- function(table, corr.seq){
  my.list <- list()
  for (i in seq_along(table)){
    hca <- as.character(table[i])
    if (corr.seq == 'yes'){
      hca <- correct_sequence(table[i])
    }
    # Completely disordered sequence (only dots): nothing is recorded here.
    symbols <- unique(unlist(strsplit(hca, split = '')))
    if (length(symbols) == 1 && symbols == "."){
      next
    }
    gaps <- unlist(strsplit(as.character(hca), split = '[0|1|h|b]'))
    extremities <- NULL
    if (startsWith(x = as.character(hca), prefix = '.')){
      extremities <- c(extremities, gaps[1])
    }
    if (endsWith(x = as.character(hca), suffix = '.')){
      extremities <- c(extremities, gaps[length(gaps)])
    }
    my.list[[i]] <- nchar(extremities)
  }
  return(my.list)
}

# Lengths of the sequences of `table` that are made only of "." characters.
#
# table    : vector (character or factor) of sequences.
# corr.seq : 'yes' to pass each sequence through correct_sequence() first.
#
# Returns a list indexed like `table`: the slot of an all-dots sequence holds
# its number of characters, every other slot stays NULL (trailing NULL slots
# are absent, so the list may be shorter than `table`).
# An empty input yields an empty list (the former 1:length(table) loop
# iterated over c(1, 0) and failed).
calculate.disordered.size.list <- function(table, corr.seq){
  my.list <- list()
  for (i in seq_along(table)){
    hca <- as.character(table[i])
    if (corr.seq == 'yes'){
      hca <- correct_sequence(table[i])
    }
    # Completely disordered sequence: a single distinct symbol, and it is "."
    symbols <- unique(unlist(strsplit(hca, split = '')))
    if (length(symbols) == 1 && symbols == "."){
      my.list[[i]] <- nchar(hca)
    }
  }
  return(my.list)
}

# Replace every short run of cluster characters by dots.
#
# The sequence is cut into alternating pieces: maximal runs of digits and
# lower-case letters ([0-9a-z]+) on one side, everything in between on the
# other. Every piece of at most 4 characters is overwritten with the same
# number of "." (a no-op for pieces that already consist of dots), then the
# pieces are glued back together.
#
# seq : a sequence (anything as.character() can convert). "~" is used
#       internally as a separator, so a "~" present in the input is dropped.
# Returns a single character string of the same length as the input sequence;
# "" is returned for an empty string (the former 1:length() loop failed there).
correct_sequence <- function(seq){
  pieces <- unlist(strsplit(gsub("([0-9a-z]+)", "~\\1~", as.character(seq)), "~"))
  too.short <- nchar(pieces) <= 4
  pieces[too.short] <- strrep(".", nchar(pieces[too.short]))
  paste(pieces, collapse = "")
}

# Resampled t-test between two numeric vectors.
#
# 1000 times, 400 values are drawn with replacement from each vector and
# compared with t.test(); the mean of the 1000 p-values is returned.
#
# table1, table2 : numeric vectors.
# Returns a single numeric value (mean p-value). Uses the random generator.
multiple.t.test <- function(table1, table2){
  p.values <- vapply(seq_len(1000), function(iteration){
    draw1 <- table1[sample(x = c(1:length(table1)), size = 400, replace = T)]
    draw2 <- table2[sample(x = c(1:length(table2)), size = 400, replace = T)]
    t.test(draw1, draw2)$p.value
  }, numeric(1))
  mean(p.values)
}

# Resampled Kolmogorov-Smirnov test between two vectors.
#
# The test is run 1000 times. In each run a vector longer than 500 values is
# replaced by 500 values drawn from it without replacement; a shorter vector
# is used in full. The mean of the 1000 p-values is returned.
#
# table1, table2 : numeric vectors.
# type           : forwarded to ks.test() as `alternative`.
# Returns a single numeric value (mean p-value).
multiple.ks.test <- function(table1, table2, type){
  p.values <- vapply(seq_len(1000), function(iteration){
    first  <- if (length(table1) > 500) sample(table1, size = 500) else table1
    second <- if (length(table2) > 500) sample(table2, size = 500) else table2
    ks.test(first, second, alternative = type)$p.value
  }, numeric(1))
  mean(p.values)
}

# Significance label for the resampled Kolmogorov-Smirnov test.
#
# Runs multiple.ks.test() (warnings silenced) and converts the mean p-value:
#   p > 0.05 -> "NS";  0.01 < p <= 0.05 -> "*";
#   0.001 < p <= 0.01 -> "**";  p <= 0.001 -> "***".
#
# tableA, tableB : vectors handed to multiple.ks.test() as table1 / table2.
# type           : alternative hypothesis, forwarded unchanged.
asteriscs.multiple.ks.test <- function(tableA, tableB, type){
  p <- suppressWarnings(multiple.ks.test(table1 = tableA, table2 = tableB, type = type))
  if (p > 0.05) return("NS")
  if (p > 0.01) return("*")
  if (p > 0.001) return("**")
  return("***")
}

# Resampled Wilcoxon rank-sum test between two vectors.
#
# The test is run 1000 times with exact = FALSE. In each run a vector longer
# than 500 values is replaced by 500 values drawn from it without
# replacement; a shorter vector is used in full. The mean of the 1000
# p-values is returned.
#
# table1, table2 : numeric vectors.
# type           : forwarded to wilcox.test() as `alternative`.
# Returns a single numeric value (mean p-value).
multiple.wilcox.test <- function(table1, table2, type){
  p.values <- vapply(seq_len(1000), function(iteration){
    first  <- if (length(table1) > 500) sample(table1, size = 500) else table1
    second <- if (length(table2) > 500) sample(table2, size = 500) else table2
    wilcox.test(first, second, alternative = type, exact = FALSE)$p.value
  }, numeric(1))
  mean(p.values)
}

# Significance label for the resampled Wilcoxon test.
#
# Runs multiple.wilcox.test() (warnings silenced) and converts the mean
# p-value:
#   p > 0.05 -> "" (empty label);  0.01 < p <= 0.05 -> "*";
#   0.001 < p <= 0.01 -> "**";  p <= 0.001 -> "***".
#
# tableA, tableB : vectors handed to multiple.wilcox.test() as table1 / table2.
# type           : alternative hypothesis, forwarded unchanged.
asteriscs.multiple.wilcox.test <- function(tableA, tableB, type){
  p <- suppressWarnings(multiple.wilcox.test(table1 = tableA, table2 = tableB, type = type))
  if (p > 0.05) return("")
  if (p > 0.01) return("*")
  if (p > 0.001) return("**")
  return("***")
}

# Resampled correlation test between two paired vectors.
#
# 1000 times, 1000 positions are drawn with replacement among the positions
# of `table1`; the same positions are taken in both vectors and compared with
# cor.test(). The mean of the 1000 p-values is returned.
#
# table1, table2 : numeric vectors, indexed with the same positions
#                  (assumed to be paired and of equal length - TODO confirm).
# Returns a single numeric value (mean p-value). Uses the random generator.
multiple.cor.test <- function(table1, table2){
  positions <- c(1:length(table1))
  p.values <- vapply(seq_len(1000), function(iteration){
    picked <- sample(x = positions, size = 1000, replace = T)
    cor.test(table1[picked], table2[picked])$p.value
  }, numeric(1))
  mean(p.values)
}

# Resampled proportion test on the share of values lying inside
# [lower.limit, higher.limit] (both bounds included).
#
# The reference proportion is the share of `table2` values inside the
# interval. 1000 times, `table1` (reduced to 500 values drawn without
# replacement when it is longer than 500) is tested against that proportion
# with prop.test(); the mean of the 1000 p-values is returned.
#
# table1, table2 : numeric vectors.
# type           : forwarded to prop.test() as `alternative`.
# Reads the global variables `lower.limit` and `higher.limit`, which must
# exist when the function is called.
multiple.prop.test.HCA.foldable <- function(table1, table2, type){
  inside <- function(values) values >= lower.limit & values <= higher.limit
  reference <- sum(inside(table2)) / length(table2)
  p.values <- vapply(seq_len(1000), function(iteration){
    draw <- if (length(table1) > 500) sample(table1, size = 500) else table1
    prop.test(x = sum(inside(draw)), n = length(draw), p = reference, alternative = type)$p.value
  }, numeric(1))
  mean(p.values)
}

# Resampled proportion test on the share of values strictly above
# higher.limit.
#
# The reference proportion is the share of `table2` values above the limit.
# 1000 times, `table1` (reduced to 500 values drawn without replacement when
# it is longer than 500) is tested against that proportion with prop.test();
# the mean of the 1000 p-values is returned.
#
# table1, table2 : numeric vectors.
# type           : forwarded to prop.test() as `alternative`.
# Reads the global variable `higher.limit`, which must exist at call time.
multiple.prop.test.HCA.aggregable <- function(table1, table2, type){
  reference <- sum(table2 > higher.limit) / length(table2)
  p.values <- vapply(seq_len(1000), function(iteration){
    draw <- if (length(table1) > 500) sample(table1, size = 500) else table1
    prop.test(x = sum(draw > higher.limit), n = length(draw), p = reference, alternative = type)$p.value
  }, numeric(1))
  mean(p.values)
}

# Resampled proportion test on the share of values strictly below
# lower.limit.
#
# The reference proportion is the share of `table2` values below the limit.
# 1000 times, `table1` (reduced to 500 values drawn without replacement when
# it is longer than 500) is tested against that proportion with prop.test();
# the mean of the 1000 p-values is returned.
#
# table1, table2 : numeric vectors.
# type           : forwarded to prop.test() as `alternative`.
# Reads the global variable `lower.limit`, which must exist at call time.
multiple.prop.test.HCA.disorder <- function(table1, table2, type){
  reference <- sum(table2 < lower.limit) / length(table2)
  p.values <- vapply(seq_len(1000), function(iteration){
    draw <- if (length(table1) > 500) sample(table1, size = 500) else table1
    prop.test(x = sum(draw < lower.limit), n = length(draw), p = reference, alternative = type)$p.value
  }, numeric(1))
  mean(p.values)
}

# Resampled proportion test on the share of strictly positive values.
#
# The reference proportion is the share of `table2` values greater than 0.
# 1000 times, `table1` (reduced to 500 values drawn without replacement when
# it is longer than 500) is tested against that proportion with prop.test();
# the mean of the 1000 p-values is returned.
#
# table1, table2 : numeric vectors.
# type           : forwarded to prop.test() as `alternative`.
multiple.prop.test.TM.ancestral <- function(table1, table2, type){
  reference <- sum(table2 > 0) / length(table2)
  p.values <- vapply(seq_len(1000), function(iteration){
    draw <- if (length(table1) > 500) sample(table1, size = 500) else table1
    prop.test(x = sum(draw > 0), n = length(draw), p = reference, alternative = type)$p.value
  }, numeric(1))
  mean(p.values)
}

# Resampled per-codon proportion test.
#
# The codons considered are the row names of the global table
# `starts.codons.both` (defined outside this function). Occurrences of each
# of them are counted in `table1` and in `table2`; a codon that never occurs
# counts 0. 1000 times, `table1` (reduced to 500 values drawn without
# replacement when it is longer than 500) is compared, codon by codon, with
# the `table2` proportion through prop.test().
#
# table1, table2 : vectors of codons (one codon per element).
# type           : forwarded to prop.test() as `alternative`.
# Returns a numeric vector named by codon: the mean of the 1000 p-values.
multiple.prop.test.codons <- function(table1, table2, type){
  codon.names <- rownames(starts.codons.both)
  n.codons <- nrow(starts.codons.both)
  # occurrences of every reference codon in `values`, 0 when it is absent
  count.codons <- function(values){
    counts <- table(values)[codon.names]
    names(counts) <- codon.names
    counts[which(is.na(counts))] <- 0
    counts
  }
  reference <- count.codons(table2)
  one.round <- function(round.id){
    draw <- if (length(table1) > 500) sample(table1, size = 500) else table1
    observed <- count.codons(draw)
    vapply(seq(1, n.codons), function(k){
      prop.test(x = observed[k], n = sum(observed),
                p = reference[k] / sum(reference), alternative = type)$p.value
    }, numeric(1))
  }
  # one column per round, one row per codon
  pvals <- matrix(unlist(lapply(seq(1, 1000), one.round)), nrow = n.codons)
  rownames(pvals) <- codon.names
  return(apply(pvals, 1, mean))
}

# Resampled per-residue proportion test.
#
# The residues considered are the row names of the global table
# `starts_both` (defined outside this function). Occurrences of each of them
# are counted in `table1` and in `table2`; a residue that never occurs counts
# 0. 1000 times, `table1` (reduced to 500 values drawn without replacement
# when it is longer than 500) is compared, residue by residue, with the
# `table2` proportion through prop.test().
#
# table1, table2 : vectors of residues (one residue per element).
# type           : forwarded to prop.test() as `alternative`.
# Returns a numeric vector named by residue: the mean of the 1000 p-values.
multiple.prop.test.aa <- function(table1, table2, type){
  aa.names <- rownames(starts_both)
  n.aa <- nrow(starts_both)
  # occurrences of every reference residue in `values`, 0 when it is absent
  count.aa <- function(values){
    counts <- table(values)[aa.names]
    names(counts) <- aa.names
    counts[which(is.na(counts))] <- 0
    counts
  }
  reference <- count.aa(table2)
  one.round <- function(round.id){
    draw <- if (length(table1) > 500) sample(table1, size = 500) else table1
    observed <- count.aa(draw)
    vapply(seq(1, n.aa), function(k){
      prop.test(x = observed[k], n = sum(observed),
                p = reference[k] / sum(reference), alternative = type)$p.value
    }, numeric(1))
  }
  # one column per round, one row per residue
  pvals <- matrix(unlist(lapply(seq(1, 1000), one.round)), nrow = n.aa)
  rownames(pvals) <- aa.names
  return(apply(pvals, 1, mean))
}

# Relative residue frequencies, overall or restricted to cluster / linker
# positions.
#
# Every HCA string is passed through correct_sequence(); the corrected
# strings are concatenated and cut into single characters. The amino-acid
# sequences are concatenated and cut the same way, and positions of the two
# character vectors are matched one to one (this presumes AA and HCA describe
# the same sequences in the same order, with equal lengths - TODO confirm).
#
# AA   : vector of amino-acid sequences.
# HCA  : vector of HCA strings.
# type : "All"      -> frequencies over all residues;
#        "Clusters" -> residues whose corrected HCA character is not ".";
#        "Linkers"  -> residues whose corrected HCA character is ".".
# Returns a table of frequencies rounded to 3 decimals; NULL (invisibly) for
# any other `type`.
residues_frequencies <- function(AA, HCA, type){
  corrected <- apply(as.data.frame(HCA), 1, FUN = correct_sequence)
  hca.symbols <- unlist(strsplit(do.call(paste, c(as.list(corrected), sep = "")), split = ""))
  residues <- unlist(strsplit(do.call(paste, c(as.list(AA), sep = "")), split = ""))

  # `label` keeps the dimension name the returned table has always carried
  relative.freq <- function(values, label){
    round(table(values, dnn = label) / length(values), 3)
  }

  if (type == "All"){
    return(relative.freq(residues, "AA.seq"))
  }
  if (type == "Clusters"){
    return(relative.freq(residues[which(hca.symbols != '.')], "clusters.residues"))
  }
  if (type == 'Linkers'){
    return(relative.freq(residues[which(hca.symbols == '.')], "linkers.residues"))
  }
}

# Read a FASTA-formatted file into a two-column data frame.
#
# Attaches the seqinr and stringr packages (they stay attached afterwards),
# reads the file as amino-acid strings, and keeps for every record the part
# of its annotation that precedes the first tab character.
#
# fasta : path of the file to read.
# Returns a data frame with the columns "ID" and "Sequence".
load.fasta.file <- function(fasta){
  library(seqinr)
  library(stringr)
  records <- read.fasta(fasta, as.string = TRUE, strip.desc = TRUE, seqtype = "AA")
  ids <- str_split_fixed(getAnnot(records), fixed("\t"), 2)[, 1]
  sequences <- as.character(getSequence.SeqFastaAA(records))
  parsed <- as.data.frame(cbind(ids, sequences))
  colnames(parsed) <- c("ID", "Sequence")
  return(parsed)
}

library("dplyr")
# Summed frequencies of five groups of amino acids.
#
# All sequences are concatenated, "*" characters are removed, and the
# frequency of every remaining character is computed and rounded to 3
# decimals. The rounded frequencies are then summed within each group:
#   Hydrophobic : V I L F M Y W
#   Hydrophilic : K R D E Q N
#   Negative    : D E
#   Positive    : K R
#   Loops       : P G D N S
#
# aa_tab : vector of amino-acid sequences.
# Returns a named numeric vector (Hydrophobic, Hydrophilic, Negative,
# Positive, Loops).
AA_frequencies_types <- function(aa_tab){
  residues <- gsub(pattern = "\\*", replacement = "",
                   x = do.call(paste, c(as.list(aa_tab), sep = "")))
  freqs <- round(table(strsplit(residues, split = '')) / nchar(residues), 3)
  groups <- list(
    Hydrophobic = c("V","I","L","F","M","Y","W"),
    Hydrophilic = c("K","R","D","E","Q","N"),
    Negative    = c("D","E"),
    Positive    = c("K","R"),
    Loops       = c("P","G","D","N","S")
  )
  vapply(groups, function(members) sum(freqs[names(freqs) %in% members]), numeric(1))
}

# Relative codon frequencies of a set of nucleotide sequences.
#
# All sequences are concatenated and cut into consecutive words of three
# characters with seqinr's s2c() / splitseq() (seqinr must be attached).
#
# table       : vector of nucleotide sequences.
# codons.list : "All" (default) for the frequency of every codon found, or a
#               character vector of codons; frequencies are then computed
#               among these codons only (a listed codon that is absent from
#               the data makes the result NA).
# Returns a table of frequencies rounded to 3 decimals.
#
# The selector is compared with "All" only when it is a single value: the
# former `if (codons.list != "All")` received a condition of length > 1
# whenever a vector of codons was given, which is an error in R >= 4.2.
codons.frequencies <- function(table, codons.list = "All"){
  nt.seq <- do.call(paste, c(as.list(table), sep = ""))
  codons <- splitseq(seq = s2c(nt.seq), frame = 0, word = 3)
  codon.counts <- table(codons)
  use.all <- length(codons.list) == 1 && isTRUE(codons.list == "All")
  if (!use.all){
    selected <- codon.counts[codons.list]
    return(round(selected / sum(selected), 3))
  }
  return(round(codon.counts / sum(codon.counts), 3))
}

# Resampled per-codon proportion test between two sets of nucleotide
# sequences.
#
# Both sets are concatenated and cut into consecutive words of three
# characters with seqinr's s2c() / splitseq() (seqinr must be attached).
# `n.iter` times, `sample.size` codons are drawn without replacement from
# `table1` (the draw is repeated until it contains exactly 64 distinct
# codons) and every codon of the draw is compared with the proportion of the
# SAME codon in `table2` through prop.test().
#
# table1, table2 : vectors of nucleotide sequences.
# type           : forwarded to prop.test() as `alternative`.
# n.iter         : number of resampling rounds (default 1000, as before).
# sample.size    : codons drawn per round (default 1000, as before).
# Returns a numeric vector named by codon: the mean p-value over the rounds.
#
# Changes with respect to the previous version:
#   * the reference count is looked up by codon name; it used to be taken at
#     the same position of the `table2` table, which pairs different codons
#     as soon as the two tables do not hold exactly the same codons;
#   * fewer than 64 distinct codons in `table1` now stops with an error
#     instead of looping forever.
# NOTE(review): if `table1` holds more than 64 distinct words, successive
# rounds may retain different sets of 64 codons; rows are labelled with the
# codons of the last round, as before.
codons.z.test <- function(table1, table2, type, n.iter = 1000, sample.size = 1000){
  stopifnot(n.iter >= 1)
  split.codons <- function(sequences){
    nt.seq <- do.call(paste, c(as.list(sequences), sep = ""))
    splitseq(seq = s2c(nt.seq), frame = 0, word = 3)
  }
  codons1 <- split.codons(table1)
  codons2 <- table(split.codons(table2))
  if (length(unique(codons1)) < 64){
    stop("table1 contains fewer than 64 distinct codons", call. = FALSE)
  }

  my.pvals <- matrix(NA_real_, nrow = 64, ncol = n.iter)
  my.codons.s <- NULL
  for (time in seq_len(n.iter)){
    codons1.s <- NULL
    while (length(codons1.s) != 64){
      codons1.s <- table(sample(codons1, size = sample.size))
    }
    my.codons.s <- names(codons1.s)
    positions <- match(my.codons.s, names(codons2))
    if (anyNA(positions)){
      stop("codon(s) absent from table2: ",
           paste(my.codons.s[is.na(positions)], collapse = ", "), call. = FALSE)
    }
    expected <- as.vector(codons2)[positions] / sum(codons2)
    for (i in seq_along(codons1.s)){
      my.pvals[i, time] <- prop.test(x = codons1.s[i], n = sum(codons1.s),
                                     p = expected[i], alternative = type)$p.value
    }
  }
  rownames(my.pvals) <- my.codons.s
  return(apply(my.pvals, 1, mean))
}

# Resampled per-residue proportion test between two sets of amino-acid
# sequences.
#
# Both sets are concatenated, cut into single characters and stripped of "*".
# `n.iter` times, `sample.size` residues are drawn without replacement from
# `table1` (the draw is repeated until it contains exactly 20 distinct
# residues) and every residue of the draw is compared with the proportion of
# the SAME residue in `table2` through prop.test().
#
# table1, table2 : vectors of amino-acid sequences.
# type           : forwarded to prop.test() as `alternative`.
# n.iter         : number of resampling rounds (default 1000, as before).
# sample.size    : residues drawn per round (default 1000, as before).
# Returns a numeric vector named by residue: the mean p-value over the rounds.
#
# Changes with respect to the previous version:
#   * the reference count is looked up by residue name; it used to be taken
#     at the same position of the `table2` table, which pairs different
#     residues as soon as the two tables do not hold exactly the same symbols;
#   * fewer than 20 distinct residues in `table1` now stops with an error
#     instead of looping forever.
# NOTE(review): if `table1` holds more than 20 distinct symbols, successive
# rounds may retain different sets of 20 symbols; rows are labelled with the
# symbols of the last round, as before.
aa.z.test <- function(table1, table2, type, n.iter = 1000, sample.size = 1000){
  stopifnot(n.iter >= 1)
  split.residues <- function(sequences){
    residues <- unlist(strsplit(do.call(paste, c(as.list(sequences), sep = "")), split = ""))
    residues[residues != "*"]
  }
  aa.seq1 <- split.residues(table1)
  aa2 <- table(split.residues(table2))
  if (length(unique(aa.seq1)) < 20){
    stop("table1 contains fewer than 20 distinct residues", call. = FALSE)
  }

  my.pvals <- matrix(NA_real_, nrow = 20, ncol = n.iter)
  my.aa.s <- NULL
  for (time in seq_len(n.iter)){
    aa1.s <- NULL
    while (length(aa1.s) != 20){
      aa1.s <- table(sample(aa.seq1, size = sample.size))
    }
    my.aa.s <- names(aa1.s)
    positions <- match(my.aa.s, names(aa2))
    if (anyNA(positions)){
      stop("residue(s) absent from table2: ",
           paste(my.aa.s[is.na(positions)], collapse = ", "), call. = FALSE)
    }
    expected <- as.vector(aa2)[positions] / sum(aa2)
    for (i in seq_along(aa1.s)){
      my.pvals[i, time] <- prop.test(x = aa1.s[i], n = sum(aa1.s),
                                     p = expected[i], alternative = type)$p.value
    }
  }
  rownames(my.pvals) <- my.aa.s
  return(apply(my.pvals, 1, mean))
}

# =================== #
#  End of FUNCTIONS   #
# =================== #

# =========== #
#  CONSTANTS  #
# =========== #
# Ten codons: ATG and the nine codons that differ from ATG at exactly one
# position (three variants per position).
near.cognate = c("ATC","ATT","ATA","ACG","AAG","AGG","CTG","GTG","TTG","ATG")
# From the paper Trifonov et al (2001) - Distinct Stages of Protein Evolution as Suggested
# by Protein Sequence Analysis
# aa.age: the 20 one-letter amino-acid codes paired with the ranks 1..20
# (presumably the order proposed in the paper cited above - verify against it).
# cbind() builds a character matrix, so both columns (default names V1, V2)
# hold text, the rank included.
aa.age = as.data.frame(cbind(c("G","A","V","D","S","E","P","L","T","N","R","I","K","Q","C","F","H","M","Y","W"),seq(1,20)))
# codon.age: one row per codon (64 rows), four text columns with the default
# names V1..V4:
#   V1 - codon written with T
#   V2 - rank from 1 to 32, each rank being shared by two consecutive codons
#   V3 - one-letter amino acid, "*" for TAG / TAA / TGA
#   V4 - the same codon as V1 written with U instead of T
codon.age = as.data.frame(
  cbind(c("GGC","GCC","GTC","GAC","GGG","CCC","GGA","TCC","GAG","CTC","GGT","ACC","GTT","AAC","GCG","CGC",
          "CCG","CGG","TCG","CGA","ACG","CGT","GAT","ATC","GCT","AGC","ACT","AGT","CCT","AGG","TCT","AGA",
          "CTT","AAG","CTG","CAG","CAA","TTG","GCA","TGC","ACA","TGT","GAA","TTC","AAA","TTT","GTG","CAC",
          "CAT","ATG","GTA","TAC","CTA","TAG","ATA","TAT","AAT","ATT","TTA","TAA","CCA","TGG","TCA","TGA"),
        
         c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,
          18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32),
        
        c("G","A","V","D","G","P","G","S","E","L","G","T","V","N","A","R","P","R","S","R","T","R","D",
          "I","A","S","T","S","P","R","S","R","L","K","L","Q","Q","L","A","C","T","C","E","F","K","F",
          "V","H","H","M","V","Y","L","*","I","Y","N","I","L","*","P","W","S","*"),
  
  c("GGC","GCC","GUC","GAC","GGG","CCC","GGA","UCC","GAG","CUC","GGU","ACC","GUU","AAC","GCG","CGC",
    "CCG","CGG","UCG","CGA","ACG","CGU","GAU","AUC","GCU","AGC","ACU","AGU","CCU","AGG","UCU","AGA",
    "CUU","AAG","CUG","CAG","CAA","UUG","GCA","UGC","ACA","UGU","GAA","UUC","AAA","UUU","GUG","CAC",
    "CAU","AUG","GUA","UAC","CUA","UAG","AUA","UAU","AAU","AUU","UUA","UAA","CCA","UGG","UCA","UGA"))
  
  )

# ================== #
#  End of CONSTANTS  #
# ================== #

# ========================== #
#   Import the initial data  #
# ========================== #
# Input tables, all located below files.path. The two CSV files are read with
# read.csv(), the three reference tables with read.table(); every file has a
# header line.
info.igorf         <- read.csv(paste0(files.path, "Script_and_data_for_figures/inputs/Tables/IGORF.csv"), header = TRUE)
info.cds           <- read.csv(paste0(files.path, "Script_and_data_for_figures/inputs/Tables/CDS.csv"), header = TRUE)
info.disorder      <- read.table(paste0(files.path, 'Script_and_data_for_figures/inputs/Tables/disprot_v7_protein_predictors_minsize30.tab'), header = TRUE)
info.globular      <- read.table(paste0(files.path, 'Script_and_data_for_figures/inputs/Tables/globular.tab'), header = TRUE)
info.transmembrane <- read.table(paste0(files.path, 'Script_and_data_for_figures/inputs/Tables/Transmembrane_helices_20_nonreduntant.tab'), header = TRUE)

# Random genome: the barcodes are read with load.fasta.file() (columns ID and
# Sequence), the accompanying table with read.table().
random.orfs.barcodes <- load.fasta.file(paste0(files.path, "Random_sequences/Random_IGR/Scer_Random.barcodes"))
random.orfs.orfold   <- read.table(paste0(files.path, "Random_sequences/Random_IGR/Scer_Random.tab"), header = TRUE)

# ===================================== #
#   Calculate HCA clusters and Linkers  #
# ===================================== #
# For each of the three datasets (IGORFs, CDS, random ORFs) compute, per
# sequence, the sizes of its clusters, of its internal linkers and of its
# extremities. Every sequence is first passed through correct_sequence()
# (corr.seq = 'yes').
igorf.cluster.size           <- calculate.cluster.size.list(table = info.igorf$HCA_sequence, corr.seq = 'yes')
igorf.linker.size            <- calculate.linker.size.list(table = info.igorf$HCA_sequence, corr.seq = 'yes')
igorf.extremities.size       <- calculate.extremities.size.list(table = info.igorf$HCA_sequence, corr.seq = 'yes')
cds.cluster.size             <- calculate.cluster.size.list(table = info.cds$HCA_sequence, corr.seq = 'yes')
cds.linker.size              <- calculate.linker.size.list(table = info.cds$HCA_sequence, corr.seq = 'yes')
cds.extremities.size         <- calculate.extremities.size.list(table = info.cds$HCA_sequence, corr.seq = 'yes')
random.orfs.cluster.size     <- calculate.cluster.size.list(table = random.orfs.barcodes$Sequence, corr.seq = 'yes')
random.orfs.linker.size      <- calculate.linker.size.list(table = random.orfs.barcodes$Sequence, corr.seq = 'yes')
random.orfs.extremities.size <- calculate.extremities.size.list(table = random.orfs.barcodes$Sequence, corr.seq = 'yes')

# ============================== #
#  Data construction for GGPLOT  #
# ============================== #
# ---------------- #
#   Sequence size  #
# ---------------- #
# The tables below are built column by column with data.frame(). The previous
# code went through cbind(), which turns every value into text before it is
# converted back with as.numeric(); where as.data.frame() produces factors
# (stringsAsFactors = TRUE, the default before R 4.0) that conversion returns
# factor level codes instead of the sizes.

# One row per value of `sizes`, tagged with the dataset label `label`.
make.size.table <- function(sizes, label){
  data.frame(size = as.numeric(sizes),
             category = rep(label, length(sizes)),
             stringsAsFactors = FALSE)
}

# Stack per-dataset tables; `category` becomes a factor whose levels follow
# the order in which the datasets are given (this fixes the plotting order).
stack.size.tables <- function(...){
  stacked <- rbind(...)
  stacked$category <- factor(stacked$category, levels = unique(stacked$category))
  stacked
}

igorfs.seq.size <- data.frame(ID = as.character(info.igorf$Seq_ID),
                              size = as.numeric(info.igorf$prot_size),
                              category = rep('IGORFs', nrow(info.igorf)),
                              stringsAsFactors = FALSE)
cds.seq.size <- data.frame(ID = as.character(info.cds$Seq_ID),
                           size = as.numeric(info.cds$prot_size),
                           category = rep('CDS', nrow(info.cds)),
                           stringsAsFactors = FALSE)
random.orfs.seq.size <- data.frame(ID = as.character(random.orfs.barcodes$ID),
                                   size = as.numeric(nchar(as.character(random.orfs.barcodes$Sequence))),
                                   category = rep('Random\ngenome', nrow(random.orfs.barcodes)),
                                   stringsAsFactors = FALSE)
seq.size <- stack.size.tables(igorfs.seq.size, random.orfs.seq.size, cds.seq.size)
# ------------------ #
#   Clusters number  #
# ------------------ #
# Number of clusters per sequence.
igorf.clusters.nb       <- make.size.table(lengths(igorf.cluster.size), "IGORFs")
random.orfs.clusters.nb <- make.size.table(lengths(random.orfs.cluster.size), "Random\ngenome")
cds.clusters.nb         <- make.size.table(lengths(cds.cluster.size), "CDS")
clusters.sizes.nb       <- stack.size.tables(igorf.clusters.nb, random.orfs.clusters.nb, cds.clusters.nb)
# ----------- #
#   Clusters  #
# ----------- #
# Size of every cluster, all sequences pooled.
igorf.clusters       <- make.size.table(unlist(igorf.cluster.size), "IGORFs")
random.orfs.clusters <- make.size.table(unlist(random.orfs.cluster.size), "Random\ngenome")
cds.clusters         <- make.size.table(unlist(cds.cluster.size), "CDS")
clusters.sizes       <- stack.size.tables(igorf.clusters, random.orfs.clusters, cds.clusters)
# ----------- #
#   Linkers   #
# ----------- #
# Size of every internal linker followed by every extremity, all sequences pooled.
igorf.linkers       <- make.size.table(c(unlist(igorf.linker.size), unlist(igorf.extremities.size)), "IGORFs")
random.orfs.linkers <- make.size.table(c(unlist(random.orfs.linker.size), unlist(random.orfs.extremities.size)), "Random\ngenome")
cds.linkers         <- make.size.table(c(unlist(cds.linker.size), unlist(cds.extremities.size)), "CDS")
linkers.sizes       <- stack.size.tables(igorf.linkers, random.orfs.linkers, cds.linkers)


# =============== #
#    Figure 1     #
# =============== # -------------------------------------------------------------------------------------- #
#   Boxplot distributions of sequence and HCA-based structural properties of IGORFs and CDS
#   (a) sequence size (b) number of HCA clusters per sequence (c) size of HCA clusters (d) size of linkers. 
#   Asterisks denote level of significance: ***p < 0.001, see Tables S3-6 for detailed p-values.
# -------------------------------------------------------------------------------------------------------- #
# Both packages are attached explicitly: the second positional argument of
# library() is `help`, so library("ggpubr","ggplot2") never asked for ggplot2.
library("ggpubr")
library("ggplot2")
fig1.colors = c('darkorchid3','darkorange2')

# The "Random\ngenome" rows are excluded with a logical condition:
# df[-which(cond), ] returns zero rows when no row matches `cond`.
# Each panel shows a violin plus a narrow boxplot per category, and carries
# the significance label of a one-sided ("l") resampled Wilcoxon test.
p.seq.size = ggplot(data = seq.size[seq.size$category != "Random\ngenome", ],
                    aes(x = category, y = size, fill = category)) +
  geom_violin(trim = TRUE, color = NA) +
  scale_fill_manual(values = adjustcolor(fig1.colors, alpha.f = 0.3)) +
  ylim(0, 400) +
  geom_boxplot(outlier.size = 0.0, width = 0.1, col = fig1.colors,
               fill = adjustcolor(fig1.colors, alpha.f = 0.5)) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
        axis.title = element_text(size = 9, face = "plain"),
        axis.text = element_text(size = 10)) +
  ylab('Number of residues') + xlab("") + ggtitle("Sequence size") +
  annotate(geom = "text", x = 1.5, y = 300, size = 5,
           label = asteriscs.multiple.wilcox.test(tableA = info.igorf$prot_size,
                                                  tableB = info.cds$prot_size,
                                                  type = "l"))

p.clust.nb = ggplot(data = clusters.sizes.nb[clusters.sizes.nb$category != "Random\ngenome", ],
                    aes(x = category, y = size, fill = category)) +
  geom_violin(trim = TRUE, color = NA) +
  ylim(0, 40) +
  scale_fill_manual(values = adjustcolor(fig1.colors, alpha.f = 0.3)) +
  geom_boxplot(outlier.size = 0.0, width = 0.1, col = fig1.colors,
               fill = adjustcolor(fig1.colors, alpha.f = 0.5)) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
        axis.title = element_text(size = 9, face = "plain"),
        axis.text = element_text(size = 10)) +
  ylab('Number of clusters') + xlab("") + ggtitle("Cluster number") +
  annotate(geom = "text", x = 1.5, y = 30, size = 5,
           label = asteriscs.multiple.wilcox.test(tableA = lengths(igorf.cluster.size),
                                                  tableB = lengths(cds.cluster.size),
                                                  type = "l"))

p.clust.sz = ggplot(data = clusters.sizes[clusters.sizes$category != "Random\ngenome", ],
                    aes(x = category, y = size, fill = category)) +
  geom_violin(trim = TRUE, color = NA) +
  ylim(0, 40) +
  scale_fill_manual(values = adjustcolor(fig1.colors, alpha.f = 0.3)) +
  geom_boxplot(outlier.size = 0.0, width = 0.1, col = fig1.colors,
               fill = adjustcolor(fig1.colors, alpha.f = 0.5)) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
        axis.title = element_text(size = 9, face = "plain"),
        axis.text = element_text(size = 10)) +
  ylab('Number of residues') + xlab("") + ggtitle("Cluster size") +
  annotate(geom = "text", x = 1.5, y = 30, size = 5,
           label = asteriscs.multiple.wilcox.test(tableA = unlist(igorf.cluster.size),
                                                  tableB = unlist(cds.cluster.size),
                                                  type = "l"))

p.link.sz = ggplot(data = linkers.sizes[linkers.sizes$category != "Random\ngenome", ],
                   aes(x = category, y = size, fill = category)) +
  geom_violin(trim = TRUE, color = NA) +
  ylim(0, 40) +
  scale_fill_manual(values = adjustcolor(fig1.colors, alpha.f = 0.3)) +
  geom_boxplot(outlier.size = 0.0, width = 0.1, col = fig1.colors,
               fill = adjustcolor(fig1.colors, alpha.f = 0.5)) +
  theme_minimal() +
  ylab('Number of residues') + xlab("") + ggtitle("Linker size") +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
        axis.title = element_text(size = 9, face = "plain"),
        axis.text = element_text(size = 10)) +
  annotate(geom = "text", x = 1.5, y = 30, size = 5,
           label = asteriscs.multiple.wilcox.test(tableA = c(unlist(igorf.linker.size), unlist(igorf.extremities.size)),
                                                  tableB = c(unlist(cds.linker.size), unlist(cds.extremities.size)),
                                                  type = "l"))

pdf(paste(files.path,'Script_and_data_for_figures/outputs/Papadopoulos_Fig1.pdf',sep=""),height = 2.5,width = 8)
# print() is explicit so that the figure is drawn even when the script is run
# through source(), where top-level values are not auto-printed.
print(annotate_figure(
  ggarrange(p.seq.size, p.clust.nb, p.clust.sz, p.link.sz, ncol = 4, nrow = 1,
            labels = 'AUTO', font.label = list(size = 12, face = "bold")),
  top = text_grob("Figure 1", color = "black", hjust = 1.2, x = 1, face = "plain", size = 10)
))
dev.off()
# ==================== #
#   END of Figure 1    #
# ==================== #

# Console summary of three figures computed above:
#  - share (in %, 2 decimals) of IGORFs whose cluster-size vector is not empty;
#  - mean size (1 decimal) of IGORF linkers, internal linkers and extremities pooled;
#  - the same mean for CDS.
print(paste(round(sum(lengths(igorf.cluster.size) != 0)/nrow(info.igorf)*100,2),"% of IGORFs harbor at least one HCA cluster."))
print(paste("Mean size of IGORF linkers is:",round(mean(c(unlist(igorf.linker.size),unlist(igorf.extremities.size))),1)))
print(paste("Mean size of CDS linkers is:",round(mean(c(unlist(cds.linker.size),unlist(cds.extremities.size))),1)))

# ================ #
#    Figure S2     #
# ================ # ---------------------------------------------------------- #
# Random IGORFs behave similarly to real IGORFs for most properties
# Boxplot distributions of sequence and HCA-based structural properties 
# of real IGORFs and random IGORFs (A) sequence size (B) number of HCA 
# clusters per sequence (C) size of HCA clusters (D) size of linkers. 
# Asterisks denote level of significance: *p < 0.05, **p < 0.01, ***p < 0.001
# ----------------------------------------------------------------------------- #
# Both packages are attached explicitly: the second positional argument of
# library() is `help`, so library("ggpubr","ggplot2") never asked for ggplot2.
library("ggpubr")
library("ggplot2")
# Colours of Supplemental Figure S2 (the variable keeps its historical name).
figS1.colors = c('darkorchid3','darkturquoise')

# The "CDS" rows are excluded with a logical condition: df[-which(cond), ]
# returns zero rows when no row matches `cond`.
# NOTE(review): the four panels do not use the same alternative hypothesis
# ("l", "g", "two.sided", "l") and the first one passes the random set as
# tableA - kept exactly as in the original, to be confirmed by the authors.
p.seq.size = ggplot(data = seq.size[seq.size$category != "CDS", ],
                    aes(x = category, y = size, fill = category)) +
  geom_violin(trim = TRUE, color = NA) +
  scale_fill_manual(values = adjustcolor(figS1.colors, alpha.f = 0.3)) +
  ylim(0, 400) +
  geom_boxplot(outlier.size = 0.0, width = 0.1, col = figS1.colors,
               fill = adjustcolor(figS1.colors, alpha.f = 0.3)) +
  theme_minimal() +
  ylab('Number of residues') + xlab("") + ggtitle("Sequence size") +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
        axis.title = element_text(size = 9, face = "plain"),
        axis.text = element_text(size = 10)) +
  annotate(geom = "text", x = 1.5, y = 300, size = 5,
           label = asteriscs.multiple.wilcox.test(tableA = seq.size$size[which(seq.size$category == "Random\ngenome")],
                                                  tableB = info.igorf$prot_size,
                                                  type = "l"))

p.clust.nb = ggplot(data = clusters.sizes.nb[clusters.sizes.nb$category != "CDS", ],
                    aes(x = category, y = size, fill = category)) +
  geom_violin(trim = TRUE, color = NA) +
  ylim(0, 40) +
  scale_fill_manual(values = adjustcolor(figS1.colors, alpha.f = 0.3)) +
  geom_boxplot(outlier.size = 0.0, width = 0.1, col = figS1.colors,
               fill = adjustcolor(figS1.colors, alpha.f = 0.3)) +
  theme_minimal() +
  ylab('Number of clusters') + xlab("") + ggtitle("Cluster number") +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
        axis.title = element_text(size = 9, face = "plain"),
        axis.text = element_text(size = 10)) +
  annotate(geom = "text", x = 1.5, y = 30, size = 5,
           label = asteriscs.multiple.wilcox.test(tableA = lengths(igorf.cluster.size),
                                                  tableB = clusters.sizes.nb$size[which(clusters.sizes.nb$category == "Random\ngenome")],
                                                  type = "g"))

p.clust.sz = ggplot(data = clusters.sizes[clusters.sizes$category != "CDS", ],
                    aes(x = category, y = size, fill = category)) +
  geom_violin(trim = TRUE, color = NA) +
  ylim(0, 40) +
  scale_fill_manual(values = adjustcolor(figS1.colors, alpha.f = 0.3)) +
  geom_boxplot(outlier.size = 0.0, width = 0.1, col = figS1.colors,
               fill = adjustcolor(figS1.colors, alpha.f = 0.3)) +
  theme_minimal() +
  ylab('Number of residues') + xlab("") + ggtitle("Cluster size") +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
        axis.title = element_text(size = 9, face = "plain"),
        axis.text = element_text(size = 10)) +
  annotate(geom = "text", x = 1.5, y = 30, size = 5,
           label = asteriscs.multiple.wilcox.test(tableA = unlist(igorf.cluster.size),
                                                  tableB = clusters.sizes$size[which(clusters.sizes$category == "Random\ngenome")],
                                                  type = "two.sided"))

p.link.sz = ggplot(data = linkers.sizes[linkers.sizes$category != "CDS", ],
                   aes(x = category, y = size, fill = category)) +
  geom_violin(trim = TRUE, color = NA) +
  ylim(0, 40) +
  scale_fill_manual(values = adjustcolor(figS1.colors, alpha.f = 0.3)) +
  geom_boxplot(outlier.size = 0.0, width = 0.1, col = figS1.colors,
               fill = adjustcolor(figS1.colors, alpha.f = 0.3)) +
  theme_minimal() +
  ylab('Number of residues') + xlab("") + ggtitle("Linker size") +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, size = 10, face = "bold"),
        axis.title = element_text(size = 9, face = "plain"),
        axis.text = element_text(size = 10)) +
  annotate(geom = "text", x = 1.5, y = 30, size = 5,
           label = asteriscs.multiple.wilcox.test(tableA = linkers.sizes$size[which(linkers.sizes$category == "IGORFs")],
                                                  tableB = linkers.sizes$size[which(linkers.sizes$category == "Random\ngenome")],
                                                  type = "l"))

pdf(paste(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S2.pdf',sep=""),height = 2.5,width = 8)
# print() is explicit so that the figure is drawn even when the script is run
# through source(), where top-level values are not auto-printed.
print(annotate_figure(
  ggarrange(p.seq.size, p.clust.nb, p.clust.sz, p.link.sz, ncol = 4, nrow = 1,
            labels = 'AUTO', font.label = list(size = 12, face = "bold"))
  #,bottom = text_grob("Supplemental Figure 2", color = "black",hjust = 1.5, x = 1, face = "plain", size = 8)
))
dev.off()
# ===================== #
#   END of Figure S1    #
# ===================== #


# --------------------------------------------- #
#  We calculate the values of the bins' limits  #
# --------------------------------------------- #
# Both limits are empirical order statistics of the globular-domain HCA scores:
# the scores are sorted in ascending order and the element at rank
# round(n * p) is taken (no interpolation between neighbouring values).
# NOTE(review): sort() drops NA values by default whereas length() counts them,
# so the ranks would be shifted if info.globular$HCA contained NAs - confirm
# the column is NA-free.
# The limit between central and right bin 97.5% of globular:
higher.limit = sort(info.globular$HCA)[round(length(info.globular$HCA)*0.975)]
# The limit between central and left bin 2.5% of globular:
lower.limit  = sort(info.globular$HCA)[round(length(info.globular$HCA)*0.025)]


# =============== #
#    Figure 2     #
# =============== # -------------------------------------------------------------------------- #
# IGORFs encompass the large spectrum of fold potential of canonical proteins 
# (A) Distribution of the HCA scores for the three reference datasets (i.e. 
# disordered regions, globular domains and transmembrane regions - green, black 
# and pink curves respectively) along with those for the CDS (orange curve) and 
# IGORFs (purple curve). There is a clear distinction between the distributions 
# of HCA scores calculated for the three reference datasets. 
# (Two-sided Kolmogorov Smirnov test, P < 2e-16 for all comparisons). Dotted black 
# lines delineate the boundaries of the low, intermediate and high HCA score categories 
# reflecting the three categories of fold potential (i.e. disorder prone, foldable, 
# or aggregation prone in solution). The boundaries are defined so that 95% of globular 
# domains fall into the intermediate HCA score category whereas the low and high HCA 
# score categories include all sequences with HCA values that are lower or higher than 
# those of 97.5% of globular domains respectively. High HCA scores reflect sequences 
# with high densities in HCA clusters that are likely to form aggregates in solution. 
# Low HCA scores indicate sequences with high propensities for disorder, while intermediate 
# scores correspond to globular proteins characterized by an equilibrium of hydrophobic and 
# hydrophilic residues (see Methods for more details). The percentages of sequences in each 
# category are given for all datasets. Raw data distributions are presented in Fig S9. 
# (B) Aggregation and disorder propensities calculated with TANGO and IUPred respectively 
# are given for CDS and IGORFs in each foldability HCA score category.
# -------------------------------------------------------------------------------------------- #
# Open the PDF device for Figure 2; it stays open until the dev.off() that
# follows the last boxplot panel.
pdf(paste(files.path,'Script_and_data_for_figures/outputs/Papadopoulos_Fig2.pdf',sep=""),height = 6,width = 5)

# 5 x 3 layout filled by row: panel 1 (density plot) takes the first three
# rows, panels 2 and 3 (boxplots drawn afterwards) take one row each.
layout(matrix(c(1,1,1,1,1,1,1,1,1,2,2,2,3,3,3),nrow=5,ncol=3,byrow=T))
par(mar=c(6,4,3,2))
# This first density is drawn in a fully transparent colour: it only sets up
# the axes and plotting region; the visible curves are added below.
plot(density(info.globular$HCA,bw = 0.7),ylim=c(0,0.40),xlim=c(-10,10),lwd=2,main='',
     xlab = 'HCA score',cex.lab=1.2,frame.plot=F,yaxt='n',col=adjustcolor("white",alpha.f = 0))
# Reference datasets as filled, border-less densities (each with its own
# hand-set bandwidth) ...
polygon(density(info.disorder$HCA,bw=0.9),col=adjustcolor('forestgreen',alpha.f = 0.2),border = F)
polygon(density(info.transmembrane$HCA,bw=0.8),col=adjustcolor('mediumvioletred',alpha.f = 0.2),border = F)
polygon(density(info.globular$HCA,bw = 0.7),col=adjustcolor('black',alpha.f = 0.3),border = F)
# ... and CDS / IGORFs as dashed lines on top.
lines(density(info.cds$HCA_score,bw=0.9),lwd=2,col='darkorange2',lty=2)
lines(density(info.igorf$HCA_score,bw = 0.7),lwd=2,col="darkorchid3",lty=2)

# Legend of the three reference datasets (filled swatches).
legend(x=-3.5,y=0.43,
       c('Globular Regions','Disordered Regions','Transmembrane Regions'),
       fill =c(adjustcolor(c('black','forestgreen','mediumvioletred'),alpha.f = 0.3)),
       lty = c(0),
       border = 0,
       bg=F,
       box.lty = 0,
       box.lwd = 0,
       cex=0.9,horiz = F,ncol = 1)
# Legend of the two dashed curves.
legend(x=3.5,y=0.43,
       c('CDS','IGORFs'),
       col=c('darkorange2','darkorchid3'),
       #lty = c(1,1,1,1,1,2),
       lty = c(2),
       lwd=2,
       bg=F,
       box.lty = 0,
       box.lwd = 0,
       cex=0.9,horiz = F,ncol = 1)
# Custom y axis (the default one was suppressed with yaxt='n').
axis(side = 2,labels = c('0','0.1','0.2','0.3'),at=c(0,0.1,0.2,0.3),las=2)
# Dotted vertical segments at the two bin limits.
segments(x0 = lower.limit,y0=0,x1= lower.limit, y1=0.31,lty=3)
segments(x0 = higher.limit,y0=0,x1= higher.limit, y1=0.31,lty=3)

# One small legend per HCA bin listing the percentage of each dataset in that
# bin, in the order globular / disordered / transmembrane / CDS / IGORFs.
# The first three labels of each legend are hard-coded strings; only the CDS
# and IGORF percentages are computed here from the HCA_bin columns.
# NOTE(review): the hard-coded values presumably come from an earlier
# computation on the reference datasets - update them if those data change.
text(x=1,y=0.33,labels ='Intermediate HCA',cex=1.2)
legend(x=-0.5,y=0.32,
       c('95.0%','2.5%','29.9%',
         paste(round(sum(info.cds$HCA_bin == "intermediate") / nrow(info.cds) * 100,1),'%',sep=""),
         paste(round(sum(info.igorf$HCA_bin == "intermediate") / nrow(info.igorf) * 100,1),'%',sep="")),
       fill =c(adjustcolor(c('black','forestgreen','mediumvioletred'),alpha.f = 0.3),adjustcolor(c('darkorange2','darkorchid3'),alpha.f = 1)),
       col = c('black','forestgreen','mediumvioletred','darkorange2','darkorchid3'),
       text.col = c('black','forestgreen','mediumvioletred','darkorange2','darkorchid3'),
       border = 0,
       bg=F,
       box.lty = 0,
       box.lwd = 0,
       cex=0.9,horiz = F,ncol = 1,x.intersp =0.5)

text(x=8.3,y=0.33,labels = 'High HCA',cex=1.2)
legend(x=7,y=0.32,
       c('2.5%','0%','69.2%',
         paste(round(sum(info.cds$HCA_bin == "high") / nrow(info.cds) * 100,1),'%',sep=""),
         paste(round(sum(info.igorf$HCA_bin == "high") / nrow(info.igorf) * 100,1),'%',sep="")),
       fill =c(adjustcolor(c('black','forestgreen','mediumvioletred'),alpha.f = 0.3),adjustcolor(c('darkorange2','darkorchid3'),alpha.f = 1)),
       col = c('black','forestgreen','mediumvioletred','darkorange2','darkorchid3'),
       text.col = c('black','forestgreen','mediumvioletred','darkorange2','darkorchid3'),
       border = 0,
       bg=F,
       box.lty = 0,
       box.lwd = 0,
       cex=0.9,horiz = F,ncol = 1,x.intersp =0.5)

text(x=-7,y=0.33,labels = 'Low HCA',cex=1.2)
legend(x=-8.5,y=0.32,
       c('2.5%','97.5%','0.9%',
         paste(round(sum(info.cds$HCA_bin == "low") / nrow(info.cds) * 100,1),'%',sep=""),
         paste(round(sum(info.igorf$HCA_bin == "low") / nrow(info.igorf) * 100,1),'%',sep="")),
       fill =c(adjustcolor(c('black','forestgreen','mediumvioletred'),alpha.f = 0.3),adjustcolor(c('darkorange2','darkorchid3'),alpha.f = 1)),
       col = c('black','forestgreen','mediumvioletred','darkorange2','darkorchid3'),
       text.col = c('black','forestgreen','mediumvioletred','darkorange2','darkorchid3'),
       border = 0,
       bg=F,
       box.lty = 0,
       box.lwd = 0,
       cex=0.9,horiz = F,ncol = 1,x.intersp =0.5)

# Panel letter and figure tag written in the margins.
mtext(text = expression(bold("A")),cex =1,side=2,at=0.42,line=2.5,las=2)
mtext(text = "Figure 2",cex = 0.7,side=3,at=10,line=1.5,las=1)
# ------------------ #
#     TANGO plot     #
# ------------------ #
par(mar=c(2,4,0.5,2))

# Notched boxplots of the aggregation propensity of IGORFs and CDS inside each
# HCA bin (low / intermediate / high). Temporaries are kept inside local() so
# that nothing is added to the global workspace.
local({
  # IGORF and CDS aggregation propensities restricted to one HCA bin
  propensity.pair = function(bin){
    list(info.igorf$aggregation_propensity[which(info.igorf$HCA_bin == bin)],
         info.cds$aggregation_propensity[which(info.cds$HCA_bin == bin)])
  }
  # The lone NaN groups occupy positions 3 and 6 and act as spacers between
  # the three bins; the three border/fill colours are recycled over the eight
  # positions, so "blue" only ever falls on those spacer slots.
  tango.groups = c(propensity.pair("low"),list(NaN),
                   propensity.pair("intermediate"),list(NaN),
                   propensity.pair("high"))
  box.palette  = c("darkorchid3","darkorange2","blue")

  boxplot(tango.groups,
          ylim=c(0,1),frame=FALSE,
          border=box.palette,
          col=adjustcolor(box.palette,alpha.f = 0.3),
          yaxt="n",xaxt="n",pch=20,notch = TRUE)

  title(ylab="Aggregation\npropensity",cex.lab=0.9,font.lab=2,line=2.2)
  # Hand-made axes: y ticks every 0.25, x labels centred under each bin
  axis(side=2,at=c(0,0.25,0.5,0.75,1),labels = c("0","","0.5","","1"),las=2)
  axis(side=1,at=c(0.5,1.5,4.5,7.5,8.5),labels = c("","Low HCA","Intermediate HCA","High HCA",""))
  legend(x=1.5,y = 1,legend = c("IGORFs","CDS"),
         fill=adjustcolor(c("darkorchid3","darkorange2"),alpha.f = 0.3),
         border = c("darkorchid3","darkorange2"),
         box.col = 0,
         box.lty = 0,
         box.lwd = 0)

  # Panel letter in the left margin
  mtext(text = expression(bold("B")),cex = 1,side=2,at=1.2,line=2.5,las=2)
})
# ------------------ #
#     IuPRED plot    #
# ------------------ # 
# Notched boxplots of the disorder propensity of IGORFs and CDS inside each
# HCA bin (low / intermediate / high), drawn with the same layout as the
# aggregation panel. Temporaries are kept inside local().
local({
  # IGORF and CDS disorder propensities restricted to one HCA bin
  disorder.pair = function(bin){
    list(info.igorf$disorder_propensity[which(info.igorf$HCA_bin == bin)],
         info.cds$disorder_propensity[which(info.cds$HCA_bin == bin)])
  }
  # NaN groups at positions 3 and 6 are spacers between the three bins
  iupred.groups = c(disorder.pair("low"),list(NaN),
                    disorder.pair("intermediate"),list(NaN),
                    disorder.pair("high"))
  box.palette   = c("darkorchid3","darkorange2","blue")

  boxplot(iupred.groups,
          ylim=c(0,1),frame=FALSE,
          border=box.palette,
          col=adjustcolor(box.palette,alpha.f = 0.3),
          yaxt="n",xaxt="n",pch=20,notch = TRUE)

  title(ylab="Disorder\npropensity",cex.lab=0.9,font.lab=2,line=2.2)
  axis(side=2,at=c(0,0.25,0.5,0.75,1),labels = c("0","","0.5","","1"),las=2)
  axis(side=1,at=c(0.5,1.5,4.5,7.5,8.5),labels = c("","Low HCA","Intermediate HCA","High HCA",""))
})

# Close the Figure 2 PDF device
dev.off()
# ==================== #
#   END of Figure 2    #
# ==================== #

# Share of the high-HCA-bin CDS that have a TM_number above zero.
local({
  is.high = info.cds$HCA_bin == "high"
  has.tm  = info.cds$TM_number > 0
  pct.tm  = round(sum(is.high & has.tm) / sum(is.high)*100,1)
  print(paste(pct.tm,
              "% of high HCA score category CDS are predicted as containing at least one TM domain"))
})

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #
#  Ancestral Reconstruction of de novo genes
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #

# Table with one row per reconstructed ancIGORF (columns used in this script:
# gene_name, gene_type, gene_HCA_score, ancIGORF_HCA_score, ...):
ancestral.igorf = read.csv(paste0(files.path,"Script_and_data_for_figures/inputs/Tables/AncIGORF.csv"),header = TRUE)
# CDS information restricted to the de novo genes present in that table
# (one row per distinct gene name, in order of first appearance):
de.novo.info    = info.cds[match(unique(ancestral.igorf$gene_name),info.cds$Gene),]

# Row subsets of the ancIGORF table. subset() keeps the rows for which the
# condition is TRUE and drops the rows for which it is NA.
single.ancestors   = subset(ancestral.igorf, gene_type == "single")
multiple.ancestors = subset(ancestral.igorf, gene_type == "multiple")
# ancIGORFs of "multiple"-type genes whose gene HCA score lies within the
# intermediate bin (limits included).
foldable.denovo    = subset(ancestral.igorf, gene_HCA_score >= lower.limit & gene_HCA_score <= higher.limit & gene_type == "multiple")

# Names of the genes that have at least one foldable ancestral IGORF.
#
# table : data frame with one row per ancIGORF and the columns `gene_name` and
#         `ancIGORF_HCA_score`.
# Returns the gene names (in order of first appearance in `table`) for which at
# least one ancIGORF HCA score lies in [lower.limit, higher.limit], or NULL
# when no gene qualifies. `lower.limit` and `higher.limit` are read from the
# enclosing (global) environment.
#
# The bounds are inclusive so that this helper agrees with the other foldable /
# intermediate tests of the script, which all use `>= lower.limit` and
# `<= higher.limit` (a score exactly on a limit belongs to the intermediate
# bin). Rows with a missing score never count as foldable.
at.least.one.foldable = function(table){
  scores       = table$ancIGORF_HCA_score
  # Computed once for the whole table instead of once per gene
  is.foldable  = scores >= lower.limit & scores <= higher.limit
  foldable.anc = NULL
  for (i in unique(table$gene_name)){
    my.candidates = which(table$gene_name == i)
    if (any(is.foldable[my.candidates], na.rm = TRUE)){ foldable.anc = c(foldable.anc,i) }
  }
  return(foldable.anc)
}

# Printed summary of foldability for de novo genes and their ancIGORFs.
# Temporaries are kept inside local().
local({
  # HCA bin of every de novo gene that has multiple ancIGORFs
  multi.bins = de.novo.info$HCA_bin[match(unique(multiple.ancestors$gene_name),de.novo.info$Gene)]
  print(paste(round(sum(multi.bins == "intermediate") / length(multi.bins),2)*100,"% of multiple-ancIGORFs de novo genes are foldable"))

  # Percentage of the genes of `anc.table` that have >= 1 foldable ancIGORF
  pct.with.foldable.anc = function(anc.table){
    round(length(at.least.one.foldable(anc.table)) / length(unique(anc.table$gene_name)),2)*100
  }
  print(paste(pct.with.foldable.anc(ancestral.igorf),'% of de novo genes have at least ONE foldable ancIGORF'))
  print(paste(pct.with.foldable.anc(multiple.ancestors),"% of multiple-ancIGORFs de novo genes have at least ONE foldable ancIGORF"))
  print(paste(pct.with.foldable.anc(single.ancestors),"% of single-ancIGORFs de novo genes have at least ONE foldable ancIGORF"))
  # NOTE(review): foldable.denovo only holds "multiple"-type genes, although
  # the printed label speaks of foldable de novo genes in general - confirm.
  print(paste(pct.with.foldable.anc(foldable.denovo),"% of foldable de novo genes have at least ONE foldable ancIGORF"))
})


# =============== #
#    Figure 3     #
# =============== # ------------------------------------------------------------------------- #
#From ancIGORFs to de novo genes. Plot of the HCA score of each ancIGORF 
# (black and white points for single and multiple ancIGORFs respectively) 
# along with its corresponding de novo gene (blue points). Each de novo gene 
# is connected to its parent ancIGORF(s) with a colored line. One should 
# notice that a de novo gene is connected to several IGORFs when it results 
# from the combination of different ancestral IGORFs (i.e. multiple-ancIGORF de novo genes). 
# Green lines indicate cases where a de novo gene is connected to a low HCA score ancIGORF, 
# while grey and pink lines indicate connections with an intermediate and a high HCA score 
# ancIGORFs, respectively. The HCA score densities of de novo genes and ancIGORFs are shown 
# in grey (bottom and top of the graph respectively).
# -------------------------------------------------------------------------------------------- #
# Figure 3 is drawn on two horizontal levels: de novo genes at y = 1 and their
# ancIGORFs at y = 1.2, linked by one segment per row of ancestral.igorf.
pdf(paste(files.path,'Script_and_data_for_figures/outputs/Papadopoulos_Fig3.pdf',sep=""),height = 4,width = 8)
par(mar=c(5,10,4,4))
# Empty canvas: the points are drawn in a fully transparent colour, this call
# only sets up the axes and the plotting region.
plot(ancestral.igorf$gene_HCA_score,rep(1,length(ancestral.igorf$gene_name)),
     xlim=c(-10,10),frame=F,yaxt='n',ylim=c(0.9,1.3),ylab='',col=adjustcolor('white',alpha.f = 0),
     xlab='HCA score',cex.lab=1,cex.main=1,cex.axis=1)

# Here we select the color of each connection line based on the
# initial HCA score of the ancIGORF: green below lower.limit, pink above
# higher.limit, grey otherwise (scores equal to a limit are drawn in grey).
# The segment is drawn inside the loop, once per row, whatever branch was taken.
for (i in 1:nrow(ancestral.igorf)){
  if (ancestral.igorf$ancIGORF_HCA_score[i] < lower.limit){
    my_col='forestgreen'
  }else if(ancestral.igorf$ancIGORF_HCA_score[i] > higher.limit){my_col='mediumvioletred'}
  else{my_col='dimgrey'}
    segments(x0=ancestral.igorf$gene_HCA_score[i],y0=1,
             x1=ancestral.igorf$ancIGORF_HCA_score[i],y1=1.2,
             col=adjustcolor(my_col,alpha.f = 0.4),lwd=1)
}
# -------------------------------------------- #
# Marginal densities, scaled by 0.5: ancIGORF scores drawn upwards from
# y = 1.205, gene scores (one value per distinct gene name) drawn downwards
# from y = 0.995.
dens = density(ancestral.igorf$ancIGORF_HCA_score)
polygon(dens$x,1.205+dens$y*0.5,col=adjustcolor("cadetblue4",alpha.f = 0.3),border = F)
dens = density(ancestral.igorf$gene_HCA_score[match(unique(ancestral.igorf$gene_name),ancestral.igorf$gene_name)])
polygon(dens$x,0.995-dens$y*0.5,col=adjustcolor("cadetblue4",alpha.f = 0.3),border = F)

# Points are added after the segments so that they sit on top of them:
# open circles for ancIGORFs of "multiple" genes, filled for "single" genes,
# blue for the genes (one gene point per row of ancestral.igorf, so a gene
# with several ancIGORFs is overplotted at the same position).
points(multiple.ancestors$ancIGORF_HCA_score,rep(1.2,length(multiple.ancestors$gene_name)),
       xlim=c(-10,10),pch=21,col="black",bg="white",cex=0.7)
points(single.ancestors$ancIGORF_HCA_score,rep(1.2,length(single.ancestors$gene_name)),
       xlim=c(-10,10),pch=20,col='black',cex=1)
points(ancestral.igorf$gene_HCA_score,rep(1,length(ancestral.igorf$gene_name)),
       xlim=c(-10,10),pch=20,col=adjustcolor('royalblue',alpha.f = 1),cex=0.7)

# Level names on the y axis and dotted vertical lines at the two bin limits.
axis(side = 2,labels = c("de novo genes",'ancIGORFs'),at=c(1,1.2),las=2,line=0,cex.axis=1)
abline(v=lower.limit,lty=3)
abline(v=higher.limit,lty=3)

legend("bottomleft",legend = c("ancIGORFs (Multiple)","ancIGORFs (Single)","de novo genes"),pch = c(21,21,16),col=c("black","black","royalblue"),pt.bg = c("white","black","royalblue") ,bty = "n",box.lty = 0,box.col = 0,pt.cex = 0.7,cex=0.7,bg=F)

# Bin names above the plot and figure tag in the top margin.
text(x = c(-6.7,1,8),y=c(1.3),labels = c("Low HCA","Intermediate HCA","High HCA"),cex=1)
mtext(text = "Figure 3",cex = 0.8,side=3,at=10,line=1.5,las=1)
dev.off()
# ==================== #
#   END of Figure 3    #
# ==================== #

# Printed comparison of the ancIGORF and IGORF HCA scores: distribution test,
# then percentage and proportion test for the foldable (intermediate bin,
# limits included) and aggregable (above higher.limit) categories.
# Temporaries are kept inside local().
local({
  anc.scores   = ancestral.igorf$ancIGORF_HCA_score
  igorf.scores = info.igorf$HCA_score
  n.anc        = nrow(ancestral.igorf)

  print(paste("ancIGORFs vs IGORFs distribution of HCA scores ---> KS-test:",
              multiple.ks.test(table1 = anc.scores,table2 = igorf.scores,type = 'two.sided')))

  print(paste(round(sum(anc.scores >= lower.limit & anc.scores <= higher.limit) / n.anc *100,1),"% foldable ancIGORFs"))

  print(paste("ancIGORFs vs IGORFs proportion of foldable ---> prop-test:",
              multiple.prop.test.HCA.foldable(table1 = anc.scores,table2 = igorf.scores,type = 'g')))

  print(paste(round(sum(anc.scores > higher.limit) / n.anc *100,1),"% aggregable ancIGORFs"))

  # Only this result is rounded (5 decimals) before being printed
  print(paste("ancIGORFs vs IGORFs proportion of aggregable ---> prop-test:",
              round(multiple.prop.test.HCA.aggregable(table1 = anc.scores,table2 = igorf.scores,type = 'l'),5)))
})


# ------------------------- #
#  Amino acid frequencies   #
# ------------------------- #
# We read the fasta files and put their rows in the same order as the
# corresponding information tables (info.cds / info.igorf), so that sequence i
# and HCA sequence i describe the same entry:
cds.fasta   = load.fasta.file(fasta=paste(files.path,'Script_and_data_for_figures/inputs/pfasta/Scer_CDS.pfasta',sep=""))
cds.fasta = cds.fasta[match(info.cds$Gene,cds.fasta$ID),]
igorf.fasta = load.fasta.file(fasta=paste(files.path,'Script_and_data_for_figures/inputs/pfasta/Scer_IGORF.pfasta',sep=""))
igorf.fasta = igorf.fasta[match(info.igorf$Seq_ID,igorf.fasta$ID),]
# We calculate the amino acid frequencies for all categories.
# For each dataset: frequencies over all residues ("All"), over HCA clusters
# ("Clusters") and over linkers ("Linkers"). "*" characters are stripped from
# the CDS / IGORF sequences before counting, and the "X" entry is removed
# from every frequency vector so that all of them hold the same residue set.
CDS.freqs.residues = residues_frequencies(AA = gsub("\\*", "",  cds.fasta$Sequence) , HCA = info.cds$HCA_sequence , type= "All")
CDS.freqs.residues = CDS.freqs.residues[  !names(CDS.freqs.residues) %in% c("X")]
CDS.freqs.cluster  = residues_frequencies(AA = gsub("\\*", "",  cds.fasta$Sequence) , HCA = info.cds$HCA_sequence , type= "Clusters")
CDS.freqs.cluster  = CDS.freqs.cluster[  !names(CDS.freqs.cluster) %in% c("X")]
CDS.freqs.linker   = residues_frequencies(AA = gsub("\\*", "",  cds.fasta$Sequence) , HCA = info.cds$HCA_sequence , type= "Linkers")
CDS.freqs.linker   = CDS.freqs.linker[  !names(CDS.freqs.linker) %in% c("X")]
igorf.freqs.residues = residues_frequencies(AA = gsub("\\*", "",  igorf.fasta$Sequence) , HCA = info.igorf$HCA_sequence , type= "All")
igorf.freqs.residues = igorf.freqs.residues[  !names(igorf.freqs.residues) %in% c("X")]
igorf.freqs.cluster  = residues_frequencies(AA = gsub("\\*", "",  igorf.fasta$Sequence)  , HCA = info.igorf$HCA_sequence , type= "Clusters")
igorf.freqs.cluster = igorf.freqs.cluster[  !names(igorf.freqs.cluster) %in% c("X")]
igorf.freqs.linker   = residues_frequencies(AA = gsub("\\*", "",  igorf.fasta$Sequence)  , HCA = info.igorf$HCA_sequence , type= "Linkers")
igorf.freqs.linker = igorf.freqs.linker[  !names(igorf.freqs.linker) %in% c("X")]
de.novo.freqs.residues = residues_frequencies(AA = gsub("\\*", "",  cds.fasta$Sequence[match(de.novo.info$Gene,cds.fasta$ID)]), HCA = info.cds$HCA_sequence[match(de.novo.info$Gene,info.cds$Gene)] , type= "All")
de.novo.freqs.residues = de.novo.freqs.residues[  !names(de.novo.freqs.residues) %in% c("X")]
de.novo.freqs.cluster  = residues_frequencies(AA = gsub("\\*", "",  cds.fasta$Sequence[match(de.novo.info$Gene,cds.fasta$ID)]), HCA = info.cds$HCA_sequence[match(de.novo.info$Gene,info.cds$Gene)] , type= "Clusters")
de.novo.freqs.cluster  = de.novo.freqs.cluster[  !names(de.novo.freqs.cluster) %in% c("X")]
de.novo.freqs.linker   = residues_frequencies(AA = gsub("\\*", "",  cds.fasta$Sequence[match(de.novo.info$Gene,cds.fasta$ID)]), HCA = info.cds$HCA_sequence[match(de.novo.info$Gene,info.cds$Gene)] , type= "Linkers")
de.novo.freqs.linker  = de.novo.freqs.linker[  !names(de.novo.freqs.linker) %in% c("X")]
# The ancIGORF vectors get the same "X" filter as every other dataset: they
# are later row-bound with the CDS / IGORF / de novo vectors into a 20-column
# matrix, so an extra "X" entry would misalign that matrix. The filter is a
# no-op when the ancestral sequences contain no "X".
ancestral.freqs.cluster = residues_frequencies(AA = ancestral.igorf$ancIGORF_aa_sequence,HCA=ancestral.igorf$ancIGORF_HCA_sequence, type= "Clusters")
ancestral.freqs.cluster = ancestral.freqs.cluster[  !names(ancestral.freqs.cluster) %in% c("X")]
ancestral.freqs.residues = residues_frequencies(AA = ancestral.igorf$ancIGORF_aa_sequence,HCA=ancestral.igorf$ancIGORF_HCA_sequence, type= "All")
ancestral.freqs.residues = ancestral.freqs.residues[  !names(ancestral.freqs.residues) %in% c("X")]

# =============== #
#    Figure 4     #
# =============== # ------------------------------------------------------------------------------ #
# (A) Radar plot reflecting the 20 amino acid frequencies of IGORFs, ancIGORFs, de novo genes and CDS. 
# (B) Plot of the HCA score of each de novo gene with those of its parent ancIGORF(s). 
# The fold potential of a single ancIGORF de novo gene is mostly determined by the one of its parent 
# ancIGORFs while the combination of several ancIGORFs through frameshift events and STOP codon mutations 
# lead most of the time to a foldable product. Single ancIGORF and multiple ancIGORFs de novo genes 
# are represented by black and white points respectively. The contour lines of the density function 
# of the relationships between single and multiple ancIGORFs de novo genes HCA scores versus the 
# score of their parent ancIGORF(s) are represented in black and grey respectively. The Spearman rank 
# correlation coefficients and the corresponding p-values are indicated on the plot. The blue region 
# indicates de novo genes encoding proteins predicted as foldable. 
# ------------------------------------------------------------------------------------------------ #
# Open the Figure 4 PDF (two panels side by side); the device is closed after
# panel B has been drawn.
pdf(paste0(files.path,"Script_and_data_for_figures/outputs/Papadopoulos_Fig4.pdf"),height = 4,width = 8)
par(mfrow=c(1,2))
# ===== #
#   A   #
# ===== #
par(mar=c(3,1,3,1))
library(fmsb)
library("MASS")
# radarchart() is called with maxmin = TRUE, so the first two rows of the table
# hold the axis maximum (0.12) and minimum (0); the data rows follow. Row names
# are given directly as rbind() argument names, columns take the residue names
# of the CDS vector.
freqs.mat.cluster = rbind(max       = rep(x = 0.12,20),
                          min       = rep(x = 0,20),
                          freqCDS   = CDS.freqs.residues,
                          freqIGORF = igorf.freqs.residues,
                          freqDN    = de.novo.freqs.residues,
                          freqANC   = ancestral.freqs.residues)
colnames(freqs.mat.cluster) = names(CDS.freqs.residues)
# CDS and IGORFs are drawn as lines without symbols (pty 32, plty 1), de novo
# genes and ancIGORFs as symbols without lines (pty 1 / 20, plty 0).
radarchart(as.data.frame(freqs.mat.cluster),
           axistype = 1,seg = 2,maxmin = TRUE,
           pcol = adjustcolor(c("darkorange2","darkorchid2","royalblue","grey"),alpha.f = 1),
           pty = c(32,32,1,20),plty=c(1,1,0,0),
           cglcol = "grey",cglty = 3,
           caxislabels = seq(0,0.20,0.06),axislabcol = "grey",calcex=0.8,
           title = "Amino acids frequencies",cex.main=0.8
           #pfcol = adjustcolor(c("darkorange2","darkorchid2"),alpha.f = 0.2)
)
legend(x=0.8,y=-1,inset=c(-0.2,0),xpd=TRUE,
       legend = c("IGORFs","CDS","ancIGORFs","de novo genes"),
       col = c("darkorchid2","darkorange2","grey","royalblue"),
       lty=c(1,1,-1,-1),pch=c(-1,-1,20,1),
       bg=FALSE,box.lty = 0,box.lwd = 0,cex = 0.7
)

# Panel letter
mtext(text = expression(bold("A")),cex = 1,side = 2,las=2,at = 1.65,adj = 0)

# ===== #
#   B   #
# ===== #
par(mar=c(5,4.5,3,2))
# One bootstrap replicate of the linear regression y1 ~ x1.
#
# x1, y1 : predictor and response vectors, paired by position.
# Returns the coefficients (intercept and slope, as given by coef()) of lm()
# fitted on the pairs resampled with replacement (as many draws as pairs).
# Uses the random number generator: call set.seed() beforehand for
# reproducible results.
myboot <- function(x1,y1) {
  paired    <- data.frame(x1 = x1, y1 = y1)
  n.pairs   <- nrow(paired)
  resampled <- paired[sample(1:n.pairs, n.pairs, TRUE), ]
  fit       <- lm(y1 ~ x1, data = resampled)
  coef(fit)
}

# Panel B: HCA score of each ancIGORF (x) against the HCA score of the de novo
# gene it gave rise to (y). The single.ancestors / multiple.ancestors tables
# are the row subsets of ancestral.igorf with gene_type "single" / "multiple".
# This first call mainly sets up the axes; the points are redrawn further down
# on top of the shaded band and the contour lines.
plot(single.ancestors$ancIGORF_HCA_score,single.ancestors$gene_HCA_score,
     xlim=c(-10,10),ylim = c(-10,10),
     xlab='ancIGORFs HCA score',ylab='de novo genes HCA score',
     pch=21,col="white",bg='black',cex=0.7,
     frame=FALSE,yaxt="n",cex.main=0.8,cex.lab=0.9)
# Horizontal band covering the gene HCA scores between the two bin limits
polygon(x = c(-12,-12,12,12),y=c(lower.limit,higher.limit,higher.limit,lower.limit),col = adjustcolor('coral',alpha.f = 0.3),lty = 0)

# We add the contouring densities (2D kernel density estimates): light grey
# for the multiple-ancIGORF genes, black for the single-ancIGORF genes.
contouring = kde2d(multiple.ancestors$ancIGORF_HCA_score,multiple.ancestors$gene_HCA_score ,n = 100)
contour(contouring, drawlabels=FALSE, nlevels=8, col='grey70', add=TRUE,lwd = 1,lty=1.5)
contouring = kde2d(single.ancestors$ancIGORF_HCA_score,single.ancestors$gene_HCA_score ,n = 50)
contour(contouring, drawlabels=FALSE, nlevels=2, col='black', add=TRUE,lwd = 1,lty=1.5)

# Dotted lines at the bin limits on both axes, then the points themselves
abline(v=c(higher.limit,lower.limit),h=c(higher.limit,lower.limit),lty=3,lwd=1,col='black')
points(multiple.ancestors$ancIGORF_HCA_score,multiple.ancestors$gene_HCA_score,col="grey30",bg="white",pch=21,cex=0.4,lwd=0.5)
points(single.ancestors$ancIGORF_HCA_score,single.ancestors$gene_HCA_score,col="black",pch=16,cex=0.6)
axis(side = 2,at=c(-10,-5,0,5,10),labels = c("-10","-5","0","5","10"),las=2)
# Two legends stacked at the same corner: the first one draws the contour-line
# keys with their labels, the second one (empty labels) overlays the symbols.
legend("topleft",legend = c("ancIGORFs (Multiple)","ancIGORFs (Single)"),col = c("grey70","grey20"),lty=1,lwd=1,box.lwd = 0,box.lty = 0,cex=0.6)
legend("topleft",legend = c("",""),col = c("black","black"),pt.bg = c("white","black"),pch=c(21,21),bty = "n",box.lwd = 0,box.lty = 0,cex=0.6)

# Spearman correlations. The Rho values are computed from the data, whereas
# the P-values written underneath are hard-coded labels.
# NOTE(review): update the hard-coded P-values if the input table changes.
text(x = 5.7,y=-5.5,labels = paste("Rho:",round(cor(single.ancestors$ancIGORF_HCA_score,single.ancestors$gene_HCA_score,method = "spearman"),2)),cex=0.7,adj=0)
text(x = 5.7,y=-6.5,labels = expression("P = 1.2x10"^-5),cex=0.7,adj=0)

text(x = 5.7,y=-8.5,labels = paste("Rho:",round(cor(multiple.ancestors$ancIGORF_HCA_score,multiple.ancestors$gene_HCA_score,method = "spearman"),2)),cex=0.7,col='grey40',adj = 0)
text(x = 5.7,y=-9.5,labels = expression("P = 1.2x10"^-9),cex=0.7,col='grey40',adj = 0)

# Panel letter
mtext(text = expression(bold("B")),cex = 1,side = 2,las=2,at = 14,adj = 5)
#mtext(text = "Figure 4",cex = 0.8,side=3,at=10,line=1.5,las=1)

# Close the Figure 4 PDF device
dev.off()

# ==================== #
#   END of Figure 4    #
# ==================== #


# ========================================================================= #
#  Calculate Properties for AncIGORFs, de novo genes, and Translated ORFs
# ========================================================================= #
# ANCESTRAL IGORF SEQUENCES :
# Cluster, linker and extremity sizes are derived from the HCA sequences
# (with sequence correction switched on); the sequence size is the number of
# characters of the amino acid sequence.
ancestral.igorf.cluster     = calculate.cluster.size.list    (ancestral.igorf$ancIGORF_HCA_sequence,corr.seq = 'yes')
ancestral.igorf.linker      = calculate.linker.size.list     (ancestral.igorf$ancIGORF_HCA_sequence,corr.seq = 'yes')
ancestral.igorf.extremities = calculate.extremities.size.list(ancestral.igorf$ancIGORF_HCA_sequence,corr.seq = 'yes')
ancestral.igorf.size        = nchar(as.character(ancestral.igorf$ancIGORF_aa_sequence))

# Linker sizes and extremity sizes are pooled before each comparison.
# NOTE(review): the first test passes the IGORF sizes first and the ancIGORF
# sizes second, the second test passes the ancIGORF sizes first; both use
# type = "l" - confirm that the direction of each one-sided test is intended.
local({
  pooled.sizes = function(linkers,extremities){ c(unlist(linkers),unlist(extremities)) }
  anc.sizes    = pooled.sizes(ancestral.igorf.linker,ancestral.igorf.extremities)

  print(paste("ancIGORFs VS IGORFs linkers size ---> Mann-Whitney U-test :",
              multiple.wilcox.test(pooled.sizes(igorf.linker.size,igorf.extremities.size),
                                   anc.sizes,type="l")))
  print(paste("ancIGORFs VS CDS linkers size ---> Mann-Whitney U-test :",
              multiple.wilcox.test(anc.sizes,
                                   pooled.sizes(cds.linker.size,cds.extremities.size),
                                   type="l")))
})

# DE NOVO GENES SEQUENCES :
# Same size lists as for the other datasets; here the sequence size is the
# number of characters of the HCA sequence.
de.novo.cluster     = calculate.cluster.size.list    (de.novo.info$HCA_sequence,corr.seq = 'yes')
de.novo.linker      = calculate.linker.size.list     (de.novo.info$HCA_sequence,corr.seq = 'yes')
de.novo.extremities = calculate.extremities.size.list(de.novo.info$HCA_sequence,corr.seq = 'yes')
de.novo.size        = nchar(as.character(de.novo.info$HCA_sequence))
# HIGHLY TRANSLATED SEQUENCES :
# /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ #
# The highly translated sequences are named "selectively translated" in the script.   #
# That name was used from the very beginning and was changed afterwards.              #
# /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ /!\ #
# The row subset is taken once and its HCA sequences are reused for the three
# size lists.
Selectively.translated              = info.igorf[which(info.igorf$translation_status  == "selectively"),]
Selectively.translated.clusters     = calculate.cluster.size.list    (table = Selectively.translated$HCA_sequence , corr.seq = 'yes')
Selectively.translated.linkers      = calculate.linker.size.list     (table = Selectively.translated$HCA_sequence , corr.seq = 'yes')
Selectively.translated.extremities  = calculate.extremities.size.list(table = Selectively.translated$HCA_sequence , corr.seq = 'yes')
# OCCASIONALLY TRANSLATED SEQUENCES :
Occasionally.translated             = info.igorf[which(info.igorf$translation_status  == "occasionally"),]
Occasionally.translated.clusters    = calculate.cluster.size.list    (table = Occasionally.translated$HCA_sequence , corr.seq = 'yes')
Occasionally.translated.linkers     = calculate.linker.size.list     (table = Occasionally.translated$HCA_sequence , corr.seq = 'yes')
Occasionally.translated.extremities = calculate.extremities.size.list(table = Occasionally.translated$HCA_sequence , corr.seq = 'yes')


# ======================================== #
#    RiboSeq reads mapping on the ORFs
# ======================================== #
# ----------------------------------------------- #
# 1. Read the table of reads positions per IGORF  #
# ----------------------------------------------- #
# One table per RiboSeq replicate (R1..R5), each with the columns
# ID, Pos, P0, P1, P2.
R1.reads.mapped = read.table(paste(files.path,"Script_and_data_for_figures/inputs/RiboSeq_reads_mapping/R1_kmer_28_paper_periodicity_all.tab",sep=""))
colnames(R1.reads.mapped) = c("ID","Pos","P0","P1","P2")
R2.reads.mapped = read.table(paste(files.path,"Script_and_data_for_figures/inputs/RiboSeq_reads_mapping/R2_kmer_28_paper_periodicity_all.tab",sep=""))
colnames(R2.reads.mapped) = c("ID","Pos","P0","P1","P2")
R3.reads.mapped = read.table(paste(files.path,"Script_and_data_for_figures/inputs/RiboSeq_reads_mapping/R3_kmer_28_paper_periodicity_all.tab",sep=""))
colnames(R3.reads.mapped) = c("ID","Pos","P0","P1","P2")
R4.reads.mapped = read.table(paste(files.path,"Script_and_data_for_figures/inputs/RiboSeq_reads_mapping/R4_kmer_28_paper_periodicity_all.tab",sep=""))
colnames(R4.reads.mapped) = c("ID","Pos","P0","P1","P2")
R5.reads.mapped = read.table(paste(files.path,"Script_and_data_for_figures/inputs/RiboSeq_reads_mapping/R5_kmer_28_paper_periodicity_all.tab",sep=""))
colnames(R5.reads.mapped) = c("ID","Pos","P0","P1","P2")
# ------------------------------------------- #
# 2. Concatenate all the reads in one table   #
# ------------------------------------------- #
# The P0 counts of the five replicates are combined row by row, which is only
# meaningful when every replicate lists the same (ID, Pos) pairs in the same
# order. This is checked explicitly: misaligned tables would otherwise be
# recycled or summed across different positions without any error.
local({
  replicate.tables = list(R2.reads.mapped,R3.reads.mapped,R4.reads.mapped,R5.reads.mapped)
  for (k in seq_along(replicate.tables)){
    same.ids = identical(as.character(replicate.tables[[k]]$ID),as.character(R1.reads.mapped$ID))
    same.pos = isTRUE(all.equal(replicate.tables[[k]]$Pos,R1.reads.mapped$Pos))
    if (!(same.ids && same.pos)){
      stop(paste0("RiboSeq replicate R",k+1," does not list the same ID/Pos rows as replicate R1"))
    }
  }
})
# The table is built with data.frame() instead of as.data.frame(cbind(...)):
# cbind() on a mix of ID strings and numbers coerces every column to a single
# type (character, or factor codes with older R defaults), which makes the
# later as.numeric() conversions depend on the R version. Here ID is always a
# character column, Pos keeps the type it was read with and the counts are
# numeric.
Reads.mapped = data.frame(ID    = as.character(R1.reads.mapped$ID),
                          Pos   = R1.reads.mapped$Pos,
                          R1_P0 = as.numeric(R1.reads.mapped$P0),
                          R2_P0 = as.numeric(R2.reads.mapped$P0),
                          R3_P0 = as.numeric(R3.reads.mapped$P0),
                          R4_P0 = as.numeric(R4.reads.mapped$P0),
                          R5_P0 = as.numeric(R5.reads.mapped$P0),
                          stringsAsFactors = FALSE)
# Total P0 reads per row, summed over the five replicates (8th column)
Reads.mapped$Total = unname(rowSums(Reads.mapped[,c("R1_P0","R2_P0","R3_P0","R4_P0","R5_P0")]))

# Nucleotide sequences of the intergenic ORFs and of the CDS, loaded with the
# same fasta reader as the protein sequences
igorf.nt.fasta = load.fasta.file(paste0(files.path,"Random_sequences/NFASTA/Scer_Intergenic.nfasta"))
cds.nt.fasta   = load.fasta.file(paste0(files.path,"Random_sequences/NFASTA/Scer_CDS.nfasta"))

# =============================== #
#  Find the first reads on ORFs   #
# =============================== #
library("seqinr")
# Accumulators filled by the loop below, one entry (or several, for the
# "random" vectors) per selectively translated IGORF
Selectively.starts_aa     = NULL
Selectively.starts_codons = NULL
Selectively.stop_codons   = NULL
Selectively.random_aa     = NULL
Selectively.random_codons = NULL
Selectively.repartitions.first.read  =  NULL
Selectively.repartitions.orf  =  NULL
for (ig in Selectively.translated$Seq_ID){
  # Indices, within the rows of Reads.mapped belonging to this IGORF, that have
  # a non-zero total read count; they are used below as codon indices.
  positions = which(Reads.mapped$Total[which(Reads.mapped$ID == ig)] != 0)
  n.covered = length(positions)
  # Nucleotide sequence of the IGORF split into codons
  my.nt.seq = splitseq(s2c(igorf.nt.fasta$Sequence[which(igorf.nt.fasta$ID == ig)]))
  n.codons  = length(my.nt.seq)

  # First covered position: its codon and the residue found at that position
  # of the protein sequence.
  # NOTE(review): the loop over occasionally translated IGORFs obtains this
  # residue by translating the codon instead - confirm both give the same
  # result.
  int         = positions[1]
  start_codon = my.nt.seq[int]
  start_aa    = substr(x = igorf.fasta$Sequence[which(igorf.fasta$ID ==ig)],start = int,stop = int)
  #print(paste(ig,start_aa))

  Selectively.stop_codons   = c(Selectively.stop_codons,my.nt.seq[n.codons])
  Selectively.starts_aa     = c(Selectively.starts_aa,start_aa)
  Selectively.starts_codons = c(Selectively.starts_codons,start_codon)
  # Fraction of covered positions, relative to the part of the ORF starting at
  # the first covered position and relative to the whole ORF
  Selectively.repartitions.first.read = c(Selectively.repartitions.first.read,n.covered / (n.codons-int+1))
  Selectively.repartitions.orf        = c(Selectively.repartitions.orf,n.covered / (n.codons))

  # Every other covered position feeds the background ("random") set
  if (n.covered > 1){
    #int = sample(positions[-1],size = 1)
    int  = positions[-1]
    start_codon = my.nt.seq[int]
    #start_aa = substr(x = igorf.fasta$Sequence[which(igorf.fasta$ID ==ig)],start = int,stop = int)
    start_aa = NULL
    for (cod in start_codon){
      start_aa = c(start_aa,translate(seq = s2c(cod)))
    }
    Selectively.random_aa     = c(Selectively.random_aa,start_aa)
    Selectively.random_codons = c(Selectively.random_codons,start_codon)
  }
}

library("seqinr")
# Accumulators filled by the loop below, one entry (or several, for the
# "random" vectors) per occasionally translated IGORF
Occasionally.starts_aa     = NULL
Occasionally.starts_codons = NULL
Occasionally.stop_codons   = NULL
Occasionally.random_aa     = NULL
Occasionally.random_codons = NULL
Occasionally.repartitions.first.read  =  NULL
Occasionally.repartitions.orf  =  NULL
for (ig in Occasionally.translated$Seq_ID){
  # Indices, within the rows of Reads.mapped belonging to this IGORF, that have
  # a non-zero total read count; they are used below as codon indices.
  positions = which(Reads.mapped$Total[which(Reads.mapped$ID == ig)] != 0)
  n.covered = length(positions)
  # Nucleotide sequence of the IGORF split into codons
  my.nt.seq = splitseq(s2c(igorf.nt.fasta$Sequence[which(igorf.nt.fasta$ID == ig)]))
  n.codons  = length(my.nt.seq)

  # First covered position: its codon and the translation of that codon.
  # NOTE(review): the loop over selectively translated IGORFs reads this
  # residue from the protein sequence instead of translating the codon -
  # confirm both give the same result.
  int         = positions[1]
  start_codon = my.nt.seq[int]
  start_aa    = translate(seq =s2c(start_codon))
  #print(paste(ig,start_aa))

  Occasionally.stop_codons   = c(Occasionally.stop_codons,my.nt.seq[n.codons])
  Occasionally.starts_aa     = c(Occasionally.starts_aa,start_aa)
  Occasionally.starts_codons = c(Occasionally.starts_codons,start_codon)
  # Fraction of covered positions, relative to the part of the ORF starting at
  # the first covered position and relative to the whole ORF
  Occasionally.repartitions.first.read = c(Occasionally.repartitions.first.read,n.covered / (n.codons-int+1))
  Occasionally.repartitions.orf        = c(Occasionally.repartitions.orf,n.covered / (n.codons))

  # Every other covered position feeds the background ("random") set
  if (n.covered > 1){
    #int = sample(positions[-1],size = 1)
    int  = positions[-1]
    start_codon = my.nt.seq[int]
    #start_aa = substr(x = igorf.fasta$Sequence[which(igorf.fasta$ID ==ig)],start = int,stop = int)
    start_aa = NULL
    for (cod in start_codon){
      start_aa = c(start_aa,translate(seq = s2c(cod)))
    }
    Occasionally.random_aa     = c(Occasionally.random_aa,start_aa)
    Occasionally.random_codons = c(Occasionally.random_codons,start_codon)
  }
}

# =============================== #
#  Figure 5 - RiboSeq first read  #
# =============================== # ------------------------------------------------------ #
# Frequencies of the 61 codons at the first translated position for highly translated 
# IGORFs (red) and occasionally translated ones (yellow). Codons which are significantly 
# observed at the first translated position compared to the other translated positions 
# are indicated with a star (z-test p.value < 0.05). Near-cognate codons are indicated 
# with diamonds. Frequencies of the 20 amino-acids at the first translated position for 
# both highly and occasionally translated IGORFs are presented in Fig. S13. 
# ---------------------------------------------------------------------------------------- #
# Codon count table: one row per codon listed in codon.age$V1 (64 rows are allocated),
# one column per codon set:
#   1 = first covered position, occasionally translated IGORFs
#   2 = first covered position, highly translated IGORFs
#   3 = other covered positions, highly translated IGORFs
#   4 = other covered positions, occasionally translated IGORFs
# Codons that never occur in a set keep a count of 0.
starts.codons.both = matrix(nrow = 64,ncol = 4,data = 0)
rownames(starts.codons.both) = codon.age$V1
codon.counts = list(table(Occasionally.starts_codons),
                    table(Selectively.starts_codons),
                    table(Selectively.random_codons),
                    table(Occasionally.random_codons))
for (k in seq_along(codon.counts)){
  starts.codons.both[match(names(codon.counts[[k]]),rownames(starts.codons.both)),k] = codon.counts[[k]]
}

# Rows ordered by decreasing count at the first position of the highly translated IGORFs
starts.codons.both = starts.codons.both[names(sort(starts.codons.both[,2],decreasing = TRUE)),]

# Drop the three stop codons. A logical mask is used instead of -which(...): a negative
# index built from an empty which() would select zero rows and silently empty the table
# if one of the stop codons were missing from the row names.
stop.codons = c("TAA","TAG","TGA")
starts.codons.both = starts.codons.both[!(rownames(starts.codons.both) %in% stop.codons), , drop = FALSE]

pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Papadopoulos_Fig5.pdf'),width = 8,height = 4)
library("ineq")

# x position of every codon (rows of starts.codons.both) and the four frequency profiles,
# each column being normalised by its own total
codon.x        = seq(1,nrow(starts.codons.both))
freq.first.occ = starts.codons.both[,1]/sum(starts.codons.both[,1])
freq.first.sel = starts.codons.both[,2]/sum(starts.codons.both[,2])
freq.other.sel = starts.codons.both[,c(3)]/sum(starts.codons.both[,c(3)])
freq.other.occ = starts.codons.both[,c(4)]/sum(starts.codons.both[,c(4)])

# Base layer: first-position frequencies of the highly translated IGORFs
plot(x=codon.x,y=freq.first.sel,frame=FALSE,las=2,
     type="p",xaxt="n",pch=20,lwd=2,col="red",
     xlab="Codons",ylab="Frequency",ylim=c(0,0.55),cex.lab=0.9)
axis(side = 1,at = codon.x,labels =codon.age$V4[match(rownames(starts.codons.both),codon.age$V1)],las=2,cex.axis=0.7)

# Frequencies at the other covered positions, drawn as lines
points(y=c(freq.other.sel),x=c(codon.x),col=adjustcolor("red",alpha.f = 0.7),type="l")
points(y=c(freq.other.occ),x=c(codon.x),col=adjustcolor("goldenrod1",alpha.f = 0.7),type="l")

# Vertical sticks and dots for the first-position frequencies of both categories
segments(x0 = codon.x,x1 = codon.x,y0=rep(0,nrow(starts.codons.both)),y1=freq.first.sel,col="red")
segments(x0 = codon.x,x1 = codon.x,y0=rep(0,nrow(starts.codons.both)),y1=freq.first.occ,col="goldenrod1")
points(y=freq.first.sel,x=codon.x,col='red',pch=20)
points(y=freq.first.occ,
       x=codon.x,type="p",col="goldenrod1",lwd=2,pch=20)

legend("topright",legend = c("Highly translated","Occasionally translated","Near cognate codon (or AUG)"),
       col = adjustcolor(c("red","goldenrod1","black"),alpha.f = 1),pch=c(20,20,5),pt.cex = 1.2,
       border = c("red","goldenrod1","white"),box.lwd = 0,box.col = 0,box.lty = 0,x.intersp = c(1,1,1)
)

# Diamonds on the codons listed in near.cognate, for both categories
near.cognate.x = match(near.cognate,rownames(starts.codons.both))
points(x=near.cognate.x,
       y=freq.first.occ[near.cognate.x],
       pch=5,cex=0.7,col="black",bg="goldenrod1")
points(x=near.cognate.x,
       y=freq.first.sel[near.cognate.x],
       pch=5,cex=0.7,col="black",bg="red")

# Gini index of each first-position frequency profile
text(x = 45,y=0.3,adj = 0,labels = paste("Gini Index:",round(ineq(freq.first.sel,type = "Gini"),2)),col="red")
text(x = 45,y=0.25,adj = 0,labels = paste("Gini Index:",round(ineq(freq.first.occ,type = "Gini"),2)),col="goldenrod1")


# Per-codon p-values, first position versus the other covered positions.
# NOTE(review): the indices of the p-value vectors are used directly as x positions, so
# they are assumed to follow the row order of starts.codons.both -- confirm against
# multiple.prop.test.codons.
sele.prop.pvals = multiple.prop.test.codons(table1 = Selectively.starts_codons,  table2 = Selectively.random_codons , type= 'g')
occ.prop.pvals  = multiple.prop.test.codons(table1 = Occasionally.starts_codons, table2 = Occasionally.random_codons , type= 'g')

# Star drawn 0.03 above every codon whose p-value is below 0.05
occ.significant  = which(occ.prop.pvals < 0.05)
sele.significant = which(sele.prop.pvals < 0.05)
points(x=occ.significant,
       y=freq.first.occ[occ.significant]+0.03,
       pch="*",cex=2,col="goldenrod1")
points(x=sele.significant,
       y=freq.first.sel[sele.significant]+0.03,
       pch="*",cex=2,col="red")

mtext(text = "Figure 5",cex = 0.8,side=3,line=1.5,las=1,adj=1)
dev.off()


# ================================= #
#  FIGURE 6 - Continuum properties  #
# ================================= # --------------------------------------------------- #
# Sequence and structural properties for the different ORF categories. Comparison of 
# (A) the sequence size, (B) cluster number, (C) cluster sizes, and (D) linker sizes for 
# each ORF category (IGORFs in purple, occasionally translated IGORFs in yellow, 
# highly translated IGORFs in red, ancIGORFs in grey, de novo genes in blue and CDS in orange). 
# The p-values were computed with the Mann-Whitney U-test (one-sided for (A), (B), (D) 
# and two-sided for (C)). 
# Asterisks denote level of significance: *p < 0.05, **p < 0.01, ***p < 0.001. For each plot, 
# the color of the asterisks indicates the ORF category used for the comparison.
# --------------------------------------------------------------------------------------- #

# ============================= #
#  Transform data for GGPLOT    #
# ============================= #
# Stack several groups of values into one long table for ggplot.
#
# groups     : named list with one vector of values per ORF category, in plotting order
# value.name : name given to the column of values
#
# Returns a data.frame with one row per value: a numeric column called `value.name` and
# a `category` factor whose levels follow the order in which the categories appear.
# The table is built directly from the values instead of going through a character
# matrix (cbind of numbers and labels): with stringsAsFactors = TRUE, the default before
# R 4.0, that round trip yields factors and as.numeric() then returns level codes rather
# than the sizes.
fig.6.long.table = function(groups, value.name){
  labels = rep(names(groups), times = lengths(groups))
  long   = data.frame(as.numeric(unlist(groups, use.names = FALSE)),
                      factor(labels, levels = unique(labels)))
  colnames(long) = c(value.name,"category")
  return(long)
}

# (A) sequence size of every ORF
fig.6.seq.size = fig.6.long.table(value.name = "seq_size", groups = list(
  "IGORFs"              = info.igorf$prot_size,
  "occasionally trans." = Occasionally.translated$prot_size,
  "highly trans."       = Selectively.translated$prot_size,
  "ancIGORFs"           = nchar(ancestral.igorf$ancIGORF_aa_sequence),
  "de novo genes"       = de.novo.info$prot_size,
  "CDS"                 = info.cds$prot_size))

# (B) number of clusters per ORF (length of each element of the cluster-size lists)
fig.6.clusters.nb = fig.6.long.table(value.name = "cluster_nb", groups = list(
  "IGORFs"              = lengths(igorf.cluster.size),
  "occasionally trans." = lengths(Occasionally.translated.clusters),
  "highly trans."       = lengths(Selectively.translated.clusters),
  "ancIGORFs"           = lengths(ancestral.igorf.cluster),
  "de novo genes"       = lengths(de.novo.cluster),
  "CDS"                 = lengths(cds.cluster.size)))

# (C) size of every cluster, all ORFs of a category pooled
fig.6.clusters.size = fig.6.long.table(value.name = "cluster_size", groups = list(
  "IGORFs"              = unlist(igorf.cluster.size),
  "occasionally trans." = unlist(Occasionally.translated.clusters),
  "highly trans."       = unlist(Selectively.translated.clusters),
  "ancIGORFs"           = unlist(ancestral.igorf.cluster),
  "de novo genes"       = unlist(de.novo.cluster),
  "CDS"                 = unlist(cds.cluster.size)))

# (D) size of every linker, the extremities being pooled with the internal linkers
fig.6.linkers.size = fig.6.long.table(value.name = "linker_size", groups = list(
  "IGORFs"              = c(unlist(igorf.linker.size),unlist(igorf.extremities.size)),
  "occasionally trans." = c(unlist(Occasionally.translated.linkers),unlist(Occasionally.translated.extremities)),
  "highly trans."       = c(unlist(Selectively.translated.linkers),unlist(Selectively.translated.extremities)),
  "ancIGORFs"           = c(unlist(ancestral.igorf.linker),unlist(ancestral.igorf.extremities)),
  "de novo genes"       = c(unlist(de.novo.linker),unlist(de.novo.extremities)),
  "CDS"                 = c(unlist(cds.linker.size),unlist(cds.extremities.size))))
# ================================== #
#   END - Transform data for GGPLOT  #
# ================================== #

# ========================== #
#  FIGURE 6 - Sequence Size  #
# ========================== #
# Only referenced by a disabled white-background "rect" annotation; still assigned so
# the top-level variable keeps the value it had.
rectangles = data.frame(x1=seq(1.75,4.75,1), x2=seq(2.25,5.25,1), y1=rep(230,4), y2=rep(400,4))

# Panel A: violin + box plots of the sequence sizes, with the significance asterisks of
# the pairwise comparisons written above categories 2 to 5. For every comparison the
# colour is the one of the category compared against and the height is fixed per
# compared category (340 IGORFs, 320 occasionally, 300 highly, 280 ancIGORFs,
# 260 de novo, 240 CDS).
figure6.p1 = local({
  # One colour per category, in the order of the levels of fig.6.seq.size$category
  orf.cols = c("darkorchid2","goldenrod1",'red','grey56','royalblue',"darkorange2")

  size.igorf = info.igorf$prot_size
  size.occ   = Occasionally.translated$prot_size
  size.sel   = Selectively.translated$prot_size
  size.anc   = ancestral.igorf.size
  size.novo  = de.novo.info$prot_size
  size.cds   = info.cds$prot_size

  # Text layer with the asterisks returned by asteriscs.multiple.wilcox.test for one
  # comparison (type "l" is presumably the one-sided "less" alternative -- confirm)
  stars = function(x, y, first, second, colour){
    annotate(geom="text",x=x,y=y,
             size=4,label = asteriscs.multiple.wilcox.test(first,second,type = "l"),
             col=colour,cex=1)
  }

  ggplot(data = fig.6.seq.size,aes(x=category, y=seq_size,fill=category)) +
    geom_violin(trim=TRUE,color=NA) + ylim(0,400) +
    scale_fill_manual(values=c(adjustcolor(orf.cols,alpha.f = 0.3))) +
    geom_boxplot(outlier.size = 0.0,width=0.1,col=orf.cols,fill=c(adjustcolor(orf.cols,alpha.f = 0.5))) +
    theme_minimal() + ylab('Number of residues') + xlab("") + ggtitle("Sequence size") +
    theme(legend.position="none",plot.title = element_text(hjust = 0.5,size=10,face="bold"),axis.title=element_text(size=9,face="plain"),axis.text.x=element_text(size=10,angle = 45,hjust = 1)) +
    # Occasionally translated IGORFs (x = 2)
    stars(2,340,size.igorf,size.occ,"darkorchid2") +
    stars(2,300,size.occ,size.sel,"red") +
    stars(2,280,size.occ,size.anc,"grey56") +
    stars(2,260,size.occ,size.novo,"royalblue") +
    stars(2,240,size.occ,size.cds,"darkorange2") +
    # Highly translated IGORFs (x = 3)
    stars(c(3),340,size.igorf,size.sel,"darkorchid2") +
    stars(c(3),320,size.occ,size.sel,"goldenrod1") +
    stars(c(3),280,size.sel,size.anc,"grey56") +
    stars(c(3),260,size.sel,size.novo,"royalblue") +
    stars(c(3),240,size.sel,size.cds,"darkorange2") +
    # ancIGORFs (x = 4)
    stars(4,340,size.igorf,size.anc,"darkorchid2") +
    stars(4,320,size.occ,size.anc,"goldenrod1") +
    stars(4,300,size.sel,size.anc,"red") +
    stars(4,260,size.anc,size.novo,"royalblue") +
    stars(4,240,size.anc,size.cds,"darkorange2") +
    # de novo genes (x = 5)
    stars(5,340,size.igorf,size.novo,"darkorchid2") +
    stars(5,320,size.occ,size.novo,"goldenrod1") +
    stars(5,300,size.sel,size.novo,"red") +
    stars(5,280,size.anc,size.novo,"grey56") +
    stars(5,240,size.novo,size.cds,"darkorange2")
})



# ============================ #
#  FIGURE 6 - Clusters Number  #
# ============================ #
# Only referenced by a disabled white-background "rect" annotation; still assigned so
# the top-level variable keeps the value it had.
rectangles = data.frame(x1=seq(1.75,4.75,1), x2=seq(2.25,5.25,1), y1=rep(23,4), y2=rep(40,4))

# Panel B: violin + box plots of the number of clusters per ORF, with the significance
# asterisks of the pairwise comparisons written above categories 2 to 5. For every
# comparison the colour is the one of the category compared against and the height is
# fixed per compared category (34 IGORFs, 32 occasionally, 30 highly, 28 ancIGORFs,
# 26 de novo, 24 CDS).
#
# Fixes with respect to the previous version of this panel:
#  * "occasionally vs IGORFs" was drawn in grey56 (the ancIGORF colour) and passed its
#    two samples in the reverse order of every other comparison of the figure; it now
#    uses darkorchid2 and the (IGORFs, occasionally) order used in panels A and D.
#    NOTE(review): the argument order matters for a one-sided test, so this changes the
#    asterisks shown for that comparison.
#  * "highly vs occasionally" was written at y = 30 (the slot of the highly translated
#    category); it now sits at y = 32, the slot of the occasionally translated category.
figure6.p2 = local({
  # One colour per category, in the order of the levels of fig.6.clusters.nb$category
  orf.cols = c("darkorchid2","goldenrod1",'red','grey56','royalblue',"darkorange2")

  # Number of clusters per ORF = length of each element of the cluster-size lists
  nb.igorf = lengths(igorf.cluster.size)
  nb.occ   = lengths(Occasionally.translated.clusters)
  nb.sel   = lengths(Selectively.translated.clusters)
  nb.anc   = lengths(ancestral.igorf.cluster)
  nb.novo  = lengths(de.novo.cluster)
  nb.cds   = lengths(cds.cluster.size)

  # Text layer with the asterisks returned by asteriscs.multiple.wilcox.test for one
  # comparison (type "l" is presumably the one-sided "less" alternative -- confirm)
  stars = function(x, y, first, second, colour){
    annotate(geom="text",x=x,y=y,
             size=4,label = asteriscs.multiple.wilcox.test(first,second,type = "l"),
             col=colour,cex=1)
  }

  ggplot(data = fig.6.clusters.nb,aes(x=category, y=cluster_nb,fill=category)) +
    geom_violin(trim=TRUE,color=NA) + ylim(0,40) +
    scale_fill_manual(values=c(adjustcolor(orf.cols,alpha.f = 0.3))) +
    geom_boxplot(outlier.size = 0.0,width=0.1,col=orf.cols,fill=c(adjustcolor(orf.cols,alpha.f = 0.5))) +
    theme_minimal() + ylab('Number of clusters') + xlab("") + ggtitle("Cluster number") +
    theme(legend.position="none",plot.title = element_text(hjust = 0.5,size=10,face="bold"),axis.title=element_text(size=9,face="plain"),axis.text.x=element_text(size=10,angle = 45,hjust = 1)) +
    # Occasionally translated IGORFs (x = 2)
    stars(2,34,nb.igorf,nb.occ,"darkorchid2") +
    stars(2,30,nb.occ,nb.sel,"red") +
    stars(2,28,nb.occ,nb.anc,"grey56") +
    stars(2,26,nb.occ,nb.novo,"royalblue") +
    stars(2,24,nb.occ,nb.cds,"darkorange2") +
    # Highly translated IGORFs (x = 3)
    stars(3,34,nb.igorf,nb.sel,"darkorchid2") +
    stars(3,32,nb.occ,nb.sel,"goldenrod1") +
    stars(3,28,nb.sel,nb.anc,"grey56") +
    stars(3,26,nb.sel,nb.novo,"royalblue") +
    stars(3,24,nb.sel,nb.cds,"darkorange2") +
    # ancIGORFs (x = 4)
    stars(4,34,nb.igorf,nb.anc,"darkorchid2") +
    stars(4,32,nb.occ,nb.anc,"goldenrod1") +
    stars(4,30,nb.sel,nb.anc,"red") +
    stars(4,26,nb.anc,nb.novo,"royalblue") +
    stars(4,24,nb.anc,nb.cds,"darkorange2") +
    # de novo genes (x = 5)
    stars(5,34,nb.igorf,nb.novo,"darkorchid2") +
    stars(5,32,nb.occ,nb.novo,"goldenrod1") +
    stars(5,30,nb.sel,nb.novo,"red") +
    stars(5,28,nb.anc,nb.novo,"grey56") +
    stars(5,24,nb.novo,nb.cds,"darkorange2")
})


# ============================ #
#  FIGURE 6 - Clusters Size    #
# ============================ #
# Only referenced by a disabled white-background "rect" annotation; still assigned so
# the top-level variable keeps the value it had.
rectangles = data.frame(x1=4.75, x2=5.25, y1=23, y2=40)

# Panel C: violin + box plots of the cluster sizes (all clusters of a category pooled).
# Only the de novo genes (x = 5) are annotated, with one comparison against each of the
# other categories; the colour is the one of the category compared against.
# NOTE(review): the figure legend announces two-sided tests for this panel while the
# calls below request types "l" and "g" -- check which types
# asteriscs.multiple.wilcox.test accepts.
figure6.p3 = local({
  # One colour per category, in the order of the levels of fig.6.clusters.size$category
  orf.cols = c("darkorchid2","goldenrod1",'red','grey56','royalblue',"darkorange2")

  novo.sizes = unlist(de.novo.cluster)

  # Text layer with the asterisks returned by asteriscs.multiple.wilcox.test for one
  # comparison drawn above the de novo genes
  stars = function(y, first, second, type, colour){
    annotate(geom="text",x=5,y=y,size=4,
             label = asteriscs.multiple.wilcox.test(first,second,type=type),
             col=colour,cex=1)
  }

  ggplot(data = fig.6.clusters.size,aes(x=category, y=cluster_size,fill=category)) +
    geom_violin(trim=TRUE,color=NA) + ylim(0,40) +
    scale_fill_manual(values=c(adjustcolor(orf.cols,alpha.f = 0.3))) +
    geom_boxplot(outlier.size = 0.0,width=0.1,col=orf.cols,fill=c(adjustcolor(orf.cols,alpha.f = 0.5))) +
    theme_minimal() + ylab('Number of residues') + xlab("") + ggtitle("Cluster size") +
    theme(legend.position="none",plot.title = element_text(hjust = 0.5,size=10,face="bold"),axis.title=element_text(size=9,face="plain"),axis.text.x=element_text(size=10,angle = 45,hjust = 1)) +
    # VS IGORFs
    stars(34,unlist(igorf.cluster.size),novo.sizes,"l","darkorchid2") +
    # VS occasionally translated
    stars(32,unlist(Occasionally.translated.clusters),novo.sizes,"l","goldenrod1") +
    # VS highly translated
    stars(30,unlist(Selectively.translated.clusters),novo.sizes,"l","red") +
    # VS ancIGORFs
    stars(28,unlist(ancestral.igorf.cluster),novo.sizes,"l","grey56") +
    # VS CDS (the only comparison requested with type "g")
    stars(24,novo.sizes,unlist(cds.cluster.size),"g","darkorange2")
})


# ============================ #
#  FIGURE 6 - Linkers Size     #
# ============================ # 
# Only referenced by a disabled white-background "rect" annotation; still assigned so
# the top-level variable keeps the value it had.
rectangles = data.frame(x1=seq(1.75,4.75,1), x2=seq(2.25,5.25,1), y1=rep(23,4), y2=rep(40,4))

# Panel D: violin + box plots of the linker sizes (internal linkers and extremities
# pooled), with the significance asterisks of the pairwise comparisons written above
# categories 2 to 5. For every comparison the colour is the one of the category compared
# against and the height is fixed per compared category (34 IGORFs, 32 occasionally,
# 30 highly, 28 ancIGORFs, 26 de novo, 24 CDS).
#
# Fix with respect to the previous version of this panel: "highly vs occasionally" was
# drawn in darkorchid2, the colour of the IGORFs, which made it read as a second
# comparison against IGORFs; it now uses goldenrod1 like every other comparison against
# the occasionally translated IGORFs.
figure6.p4 = local({
  # One colour per category, in the order of the levels of fig.6.linkers.size$category
  orf.cols = c("darkorchid2","goldenrod1",'red','grey56','royalblue',"darkorange2")

  # Linker sizes of one category: internal linkers followed by the extremities
  pooled = function(linkers, extremities){
    c(unlist(linkers),unlist(extremities))
  }
  link.igorf = pooled(igorf.linker.size,igorf.extremities.size)
  link.occ   = pooled(Occasionally.translated.linkers,Occasionally.translated.extremities)
  link.sel   = pooled(Selectively.translated.linkers,Selectively.translated.extremities)
  link.anc   = pooled(ancestral.igorf.linker,ancestral.igorf.extremities)
  link.novo  = pooled(de.novo.linker,de.novo.extremities)
  link.cds   = pooled(cds.linker.size,cds.extremities.size)

  # Text layer with the asterisks returned by asteriscs.multiple.wilcox.test for one
  # comparison (type "l" is presumably the one-sided "less" alternative -- confirm)
  stars = function(x, y, first, second, colour){
    annotate(geom="text",x=x,y=y,
             size=4,label = asteriscs.multiple.wilcox.test(first,second,type="l"),
             col=colour,cex=1)
  }

  ggplot(data = fig.6.linkers.size,aes(x=category, y=linker_size,fill=category)) +
    geom_violin(trim=TRUE,color=NA) + ylim(0,40) +
    scale_fill_manual(values=c(adjustcolor(orf.cols,alpha.f = 0.4))) +
    geom_boxplot(outlier.size = 0.0,width=0.1,col=orf.cols,fill=c(adjustcolor(orf.cols,alpha.f = 0.5))) +
    theme_minimal() + ylab('Number of residues') + xlab("") + ggtitle("Linker size") +
    theme(legend.position="none",plot.title = element_text(hjust = 0.5,size=10,face="bold"),axis.title=element_text(size=9,face="plain"),axis.text.x=element_text(size=10,angle = 45,hjust = 1)) +
    # Occasionally translated IGORFs (x = 2)
    stars(2,34,link.igorf,link.occ,"darkorchid2") +
    stars(2,30,link.occ,link.sel,"red") +
    stars(2,28,link.occ,link.anc,"grey56") +
    stars(2,26,link.occ,link.novo,"royalblue") +
    stars(2,24,link.occ,link.cds,"darkorange2") +
    # Highly translated IGORFs (x = 3)
    stars(3,34,link.igorf,link.sel,"darkorchid2") +
    stars(3,32,link.occ,link.sel,"goldenrod1") +
    stars(3,28,link.sel,link.anc,"grey56") +
    stars(3,26,link.sel,link.novo,"royalblue") +
    stars(3,24,link.sel,link.cds,"darkorange2") +
    # ancIGORFs (x = 4)
    stars(4,34,link.igorf,link.anc,"darkorchid2") +
    stars(4,32,link.occ,link.anc,"goldenrod1") +
    stars(4,30,link.sel,link.anc,"red") +
    stars(4,26,link.anc,link.novo,"royalblue") +
    stars(4,24,link.anc,link.cds,"darkorange2") +
    # de novo genes (x = 5)
    stars(5,34,link.igorf,link.novo,"darkorchid2") +
    stars(5,32,link.occ,link.novo,"goldenrod1") +
    stars(5,30,link.sel,link.novo,"red") +
    stars(5,28,link.anc,link.novo,"grey56") +
    stars(5,24,link.novo,link.cds,"darkorange2")
})


# Write Figure 6 (four panels side by side) to a PDF file.
# cowplot is attached before the device is opened so that a missing package
# does not leave an open, empty PDF device behind.
library("cowplot")
pdf(paste0(files.path,"Script_and_data_for_figures/outputs/Papadopoulos_Fig6.pdf"),height = 3.5,width = 9)
# The arranged figure is printed explicitly: grid-based figures are only drawn
# when printed, and top-level auto-printing does not happen when the script is
# run through source(), which would otherwise produce an empty PDF.
print(
  annotate_figure(
    ggarrange(as.list(as_grob(figure6.p1)),as.list(as_grob(figure6.p2)),
              as.list(as_grob(figure6.p3)),as.list(as_grob(figure6.p4)),
              labels = 'AUTO',nrow = 1,ncol=4,font.label = list(size=12,face = "bold")),
    top = text_grob("Figure 6", color = "black",hjust = 1.2, x = 1, face = "plain", size = 10)
  )
)
dev.off()

# ==================== #
#   END of Figure 6    #
# ==================== #


# ==================== #
#      Discussion      #
# ==================== #
# --- Share of sequences with at least one predicted TM segment --- #
# High HCA score IGORFs
discussion.high.tm = info.igorf$TM_number[which(info.igorf$HCA_bin == "high")]
print(paste("High HCA score IGORFs with at least ONE predicted TM : ",
            round(sum(discussion.high.tm > 0) / sum(info.igorf$HCA_bin == "high"),3)*100,
            "%"))
# AncIGORFs
print(paste("AncIGORFs with at least ONE predicted TM : ",
            round(sum(ancestral.igorf$TM_number > 0) / nrow(ancestral.igorf)*100,1),
            "%"))
# All IGORFs
print(paste("IGORFs with at least ONE predicted TM :",
            round(sum(info.igorf$TM_number>0)/nrow(info.igorf)*100,1),
            "%"))
# Proportion test AncIGORFs -vs- IGORFs (helper defined elsewhere in this script;
# the message below labels its result as a one proportion Z test p-value)
prop.pval = multiple.prop.test.TM.ancestral(table1 = ancestral.igorf$TM_number,
                                            table2 = info.igorf$TM_number,
                                            type = 'g')
print(paste('One proportion Z test pvalue : ',round(prop.pval,5)))

# --- Agreement between de novo genes and their ancIGORFs at the HCA extremes --- #
# Genes for which BOTH the gene and its ancIGORF are above the higher limit
discussion.both.high = unique(ancestral.igorf$gene_name[which(
  ancestral.igorf$gene_HCA_score > higher.limit & ancestral.igorf$ancIGORF_HCA_score > higher.limit)])
# Genes for which BOTH the gene and its ancIGORF are below the lower limit
discussion.both.low  = unique(ancestral.igorf$gene_name[which(
  ancestral.igorf$gene_HCA_score < lower.limit  & ancestral.igorf$ancIGORF_HCA_score < lower.limit)])
# Genes whose own HCA score lies outside the [lower.limit , higher.limit] range
discussion.extreme.genes = unique(ancestral.igorf$gene_name[
  (ancestral.igorf$gene_HCA_score > higher.limit | ancestral.igorf$gene_HCA_score < lower.limit)])

print(paste("Disorder or aggregation prone de novo proteins are most of the time (",
            round(length(c(discussion.both.high,discussion.both.low)) / length(discussion.extreme.genes),2) *100,
            "%) associated with ancIGORFs expected to encode disordered or aggregation-prone peptides as well",
            sep=""))


# ================================ #
#        Mutational Events         #
# ================================ #
# Load the table of mutational events: one row per de novo gene with its
# number of frame shifts (FS) and stop codon mutations (SCM).
mut.events = read.table(paste0(files.path,"De_novo_genes_ancestral_reconstruction/mutational_events.txt"))
colnames(mut.events) = c("ID","FS","SCM")
# YMR151W is excluded from the counts.
# A logical filter is used instead of `-which(...)`: when the ID is absent,
# `-which(...)` is an empty index and would silently drop EVERY row.
mut.events = mut.events[!(mut.events$ID %in% "YMR151W"),]

# Total number of events (FS + SCM over all genes), computed once
total.FS     = sum(mut.events$FS)
total.SCM    = sum(mut.events$SCM)
total.events = total.FS + total.SCM

# NOTE: the original calls passed `paste=""`, a typo for `sep=""`; the text pieces
# already carry their own spaces, so they are concatenated without a separator.
print(paste(total.FS," Frame Shift in ",total.events," total ORF triger events -->",round(total.FS/total.events,4)*100,"%",sep=""))
print(paste(total.SCM," Codon Stop mutations in ",total.events," total ORF triger events -->",round(total.SCM/total.events,4)*100,"%",sep=""))
print(paste("Frame Shift / Codon Stop mutations ratio --->",round(total.FS / total.SCM,2)))

# Number of genes in each of the four FS / SCM combinations
print(paste(length(which(mut.events$FS > 0 & mut.events$SCM > 0)) ," de novo genes with both Frame Shifts and Codon Stop Mutation",sep = ''))
print(paste(length(which(mut.events$FS == 0 & mut.events$SCM == 0)) ," de novo genes with no Frame Shifts nor Codon Stop Mutation --> Long ancIGORF de novo genes",sep = ''))
print(paste(length(which(mut.events$FS == 0 & mut.events$SCM > 0)) ," de novo genes with only Codon Stop Mutation(s)",sep = ''))
print(paste(length(which(mut.events$FS > 0 & mut.events$SCM == 0)) ," de novo genes with only  Frame Shifts(s)",sep = ''))
# ======================================================================================


# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #
#    SUPPLEMENTARY FIGURES & ANALYSES    #
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #

# ================ #
#    Figure S3     #
# ================ #
#  CDS are enriched in hydrophilic residues
#  (a) Log ratios of amino acid frequencies in HCA clusters of CDS versus HCA clusters of IGORFs. 
#  (b) Log ratios of amino acid frequencies in HCA linkers of CDS versus HCA linkers of IGORFs.

pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S3.pdf'),height = 8,width = 6)
par(mfrow=c(2,1))

# ---------------------------- #
#   Panel A : HCA clusters     #
# ---------------------------- #
# We calculate the ratio log10 of the CDS/IGORF clusters residues freqs.
# sort() orders the ratios increasingly and drops NA/NaN values by default.
log.CDS.IGORF.cluster = sort(log10(CDS.freqs.cluster/igorf.freqs.cluster))
# COLORS SELECTION --------------- #
# orange = enriched in CDS (ratio > 0) ; purple = enriched in IGORFs (ratio < 0)
couleurs = NULL
couleurs[which(log.CDS.IGORF.cluster> 0)] = 'darkorange2'
couleurs[which(log.CDS.IGORF.cluster< 0)] = 'darkorchid3'
# -------------------------------- #
# NOTE(review): the bar labels are hard-coded in the order expected after
# sorting (19 labels here, no "Pro"); they must be re-checked by hand if the
# input frequencies change.
# The original call passed both las=1 and las=2; only las=2 is kept, as in panel B.
barplot(t(as.matrix(log.CDS.IGORF.cluster)),
        beside = TRUE,
        col = adjustcolor(couleurs,alpha.f = 0.3),
        border = couleurs,
        ylim=c(-0.5,0.5),
        main='Amino acids enrichment of HCA clusters',cex.names = 0.8,las=2,yaxt='n',cex.main=0.8,
        names.arg = c("Cys","Phe","His","Tyr","Arg","Ser","Ile","Leu","Thr","Val","Trp","Asn","Met","Gln","Lys","Ala","Gly","Glu","Asp"))
# The y axis is drawn by hand (yaxt='n' above) to control the tick labels
axis(side = 2,labels = c('-0.4','-0.2','0','0.2','0.4'),at = c(-0.4,-0.2,0,0.2,0.4),las=2,cex.axis=0.7)
title(ylab='log10( freq[CDS] / freq[IGORF] )',cex.lab=0.7,font.lab=2,line=2.5)

# Panel letter in the left margin
mtext(text = expression(bold("A")),cex = 1,side = 2,las=2,at = 0.7,adj = 5)

#mtext(text = "Supplemental Figure 3",cex = 0.8,side=3,line=3,las=1,adj = 1)


# ---------------------------- #
#   Panel B : HCA linkers      #
# ---------------------------- #
# We calculate the ratio log10 of the CDS/IGORF linkers residues freqs
log.CDS.IGORF.linker = sort(log10(CDS.freqs.linker /igorf.freqs.linker))
# COLORS SELECTION --------------- #
couleurs = NULL
couleurs[which(log.CDS.IGORF.linker> 0)] = 'darkorange2'
couleurs[which(log.CDS.IGORF.linker< 0)] = 'darkorchid3'
# -------------------------------- #
# NOTE(review): hard-coded labels again (20 labels here), see panel A.
barplot(t(as.matrix(log.CDS.IGORF.linker)),
        beside = TRUE,
        col = adjustcolor(couleurs,alpha.f = 0.3),
        border = couleurs,
        ylim=c(-0.5,0.5),
        main='Amino acids enrichment of HCA linkers',cex.names = 0.8,las=2,yaxt='n',cex.main=0.8,
        names.arg = c("Cys","Arg","Trp","Phe","His","Lys","Tyr","Leu","Ile","Pro","Ser","Thr","Gln","Asn","Val","Gly","Ala","Met","Glu","Asp"))
axis(side = 2,labels = c('-0.4','-0.2','0','0.2','0.4'),at = c(-0.4,-0.2,0,0.2,0.4),las=2,cex.axis=0.7)
title(ylab='log10( freq[CDS] / freq[IGORF] )',cex.lab=0.7,font.lab=2,line=2.5)

mtext(text = expression(bold("B")),cex = 1,side = 2,las=2,at = 0.7,adj = 5)


dev.off()
# ===================== #
#   END of Figure S3    #
# ===================== #


# ================ #
#    Figure S4     #
# ================ #
#   Abundant proteins are enriched in negatively charged amino acids
#   Protein abundances of all cytoplasmic proteins are plotted against their corresponding negatively charged residues (Aspartate and Glutamate) frequencies. 
#   The Spearman rank correlation coefficient is indicated on the plot (p-value < 2.2e-16)

# Load the cytoplasmic protein names and the whole-organism abundance table,
# then keep the abundance rows of the cytoplasmic proteins (in cytoplasm order).
cytoplasm    = read.table(paste0(files.path,"Script_and_data_for_figures/inputs/Abundance/CYTOPLASM_NAMES.txt"))
abundancy    = read.table(paste0(files.path,'Script_and_data_for_figures/inputs/Abundance/4932-WHOLE_ORGANISM-integrated.tab'))
abundancy    = abundancy[match(cytoplasm$V1,abundancy$V1),]
# Proteins without a match (NA rows produced by match) are removed
abundancy    = abundancy[complete.cases(abundancy),]
abundancy$V2 = log10(abundancy$V2)
# Remove infinite values (log10 of a zero abundance).
# A logical filter is used instead of `-which(...)`: if no value is infinite,
# `-which(...)` is an empty index and would silently drop EVERY row.
abundancy    = abundancy[!is.infinite(abundancy$V2),]

# Per-protein residue-type frequencies, in the same order as `abundancy`
NEG  = NULL
HYD  = NULL
LOOP = NULL
POS  = NULL
for (i in abundancy$V1){
  # Protein sequence of the current ID, without the stop symbol "*"
  my.seq = gsub(cds.fasta$Sequence[which(cds.fasta$ID == i)],pattern = "\\*",replacement = "")
  # The frequencies are computed once per sequence and then indexed
  # (assumes AA_frequencies_types() is deterministic; it is defined elsewhere in this script)
  aa.types = AA_frequencies_types(aa_tab = my.seq)
  NEG = c(NEG,aa.types[3])
  POS = c(POS,aa.types[4])
  HYD = c(HYD,aa.types[1])
}


library(MASS)   # provides kde2d() for the 2D density contours
pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S4.pdf'),height = 4,width = 4)
# Scatter plot: negatively charged residues frequency -vs- log10 abundance
plot(NEG , abundancy$V2,pch=20,cex=0.5,frame =FALSE,cex.axis=0.8,cex.lab=0.8,
     col='grey',xlim=c(0,0.4),yaxt="n",cex.main=0.8,
     ylab='Log10(Abundancy) (ppm)',xlab='Negative charged residues frequency')
axis(side = 2,at = seq(-2,4),labels = c("-2","-1","0","1","2","3","4"),cex.axis=0.8,las=2)
# 2D kernel density estimate drawn as contour lines over the points
contouring = kde2d(NEG,abundancy$V2 ,n = 500)
contour(contouring, drawlabels=FALSE, nlevels=4, col='black', add=TRUE,lwd = 2)
# Spearman rank correlation coefficient printed on the plot
spearman.rho = cor.test(NEG , abundancy$V2,method = "spearman",exact = FALSE)$estimate
text(x=0.24,y=3,pos = 4,
     labels = paste("Rho = ",round(spearman.rho,2),sep = "")
)
# The second text() call of the original (p-value label) had its only `labels`
# argument commented out, so text() fell back to its default label instead of
# printing nothing; it is disabled as a whole here.
#text(x=0.24,y=2.3,pos = 4,
#     labels = paste("Pval:2e-16",sep = "")
#)
#mtext(text = "Supplemental Figure 4",cex = 0.8,side=3,line=3,las=1,adj = 1)
dev.off()
# ===================== #
#   END of Figure S4    #
# ===================== #

# ================ #
#    Figure S5     #
# ================ #

# We calculate the frequencies of the codons 
igorf.codons.freqs = codons.frequencies(table = igorf.nt.fasta$Sequence)
cds.codons.freqs   = codons.frequencies(table = cds.nt.fasta$Sequence)
# We calculate the Z-test for each amino acid & codon frequency (IGORFs -vs- CDS)
# (the helpers are defined elsewhere in this script; presumably type "l" / "g"
#  select the "less" / "greater" alternative -- TODO confirm)
cds.freqs.VS.igorf.freqs.codons.less  = codons.z.test(table2 = igorf.nt.fasta$Sequence , table1 = cds.nt.fasta$Sequence , type = "l")
cds.freqs.VS.igorf.freqs.codons.great = codons.z.test(table2 = igorf.nt.fasta$Sequence , table1 = cds.nt.fasta$Sequence , type = "g")
cds.freqs.VS.igorf.freqs.aa.less      = aa.z.test(table2 = igorf.fasta$Sequence , table1 = cds.fasta$Sequence , type = "l")
cds.freqs.VS.igorf.freqs.aa.great     = aa.z.test(table2 = igorf.fasta$Sequence , table1 = cds.fasta$Sequence , type = "g")

# Positions (along the chronological order) with a p-value below 0.05.
# They are computed once here instead of being repeated in every plotting call.
aa.signif.less     = which(cds.freqs.VS.igorf.freqs.aa.less[aa.age$V1] < 0.05)
aa.signif.great    = which(cds.freqs.VS.igorf.freqs.aa.great[aa.age$V1] < 0.05)
codon.signif.less  = which(cds.freqs.VS.igorf.freqs.codons.less[codon.age$V1] < 0.05)
codon.signif.great = which(cds.freqs.VS.igorf.freqs.codons.great[codon.age$V1] < 0.05)

pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S5.pdf'),height = 8,width = 9.5)
par(mfrow=c(2,1))

# --------------------------------- #
#   Panel A : amino acid frequencies #
# --------------------------------- #
plot(igorf.freqs.residues[aa.age$V1],type="l",col='darkorchid3',pch=20,xlab='Amino acids (by chronology)',ylab='Frequency',ylim=c(0,0.12),las=1,cex.axis=0.8,cex.lab=0.8)
# Filled areas: IGORFs first, then CDS on top of it
polygon(y = c(igorf.freqs.residues[aa.age$V1],0,0), x=c(seq(1,20),20,1),col = adjustcolor("darkorchid3",alpha.f = 1),border = FALSE)
points(CDS.freqs.residues[aa.age$V1],type="l",pch=20,col="darkorange2")
polygon(y = c(CDS.freqs.residues[aa.age$V1],0,0), x=c(seq(1,20),20,1),col = adjustcolor("darkorange2",alpha.f = 1),border = FALSE)
# The IGORF line is redrawn so that it stays visible over the CDS area.
# (xlab / ylab were dropped from this call: they are not graphical parameters
#  for points() and only produced warnings; `lw` is spelled out as `lwd`)
points(igorf.freqs.residues[aa.age$V1],type="l",col='darkorchid3',pch=20,lwd=2)
# Z-test Significant amino acids:
# dotted segment from the curve up to the star drawn at the top of the panel
segments(x0 = aa.signif.less,x1=aa.signif.less,y1=0.115,y0=igorf.freqs.residues[aa.age$V1][aa.signif.less],lty=3)
segments(x0 = aa.signif.great,x1=aa.signif.great,y1=0.115,y0=CDS.freqs.residues[aa.age$V1][aa.signif.great],lty=3)
points(x=aa.signif.less,y=rep(0.12,length(aa.signif.less)),pch="*",col='darkorchid3',cex=1.5)
points(x=aa.signif.great,y=rep(0.12,length(aa.signif.great)),pch="*",col='darkorange3',cex=1.5)

legend("top",legend = c("IGORFs","CDS"),
       fill = adjustcolor(c("darkorchid3","darkorange2"),alpha.f = 1),
       border =c("darkorchid3","darkorange2"),box.lwd = 0,box.col = 0,box.lty = 0)

mtext(text = expression(bold("A")),cex = 1,side = 2,las=2,at = 0.15,adj = 5)
#mtext(text = "Supplemental Figure 5",cex = 0.9,side = 3,las=1,adj =1,line=2)


# ---------------------------- #
#   Panel B : codon frequencies #
# ---------------------------- #
plot(igorf.codons.freqs[codon.age$V1],type="l",col='darkorchid3',pch=20,xlab='Codons (by chronology)',ylab='Frequency',las=2,xaxt='n',ylim=c(0,0.06),cex.axis=0.8,cex.lab=0.8)
# x axis labels built from columns V4 and V3 of codon.age, as "V4 (V3)"
axis(side = 1,labels = paste(codon.age$V4," (",codon.age$V3,")",sep = ""),at = seq(1,64),cex.axis=0.5,las=2)
polygon(y = c(igorf.codons.freqs[codon.age$V1],0,0), x=c(seq(1,64),64,1),col = adjustcolor("darkorchid3",alpha.f = 1),border = FALSE)
points(cds.codons.freqs[codon.age$V1],type="l",pch=20,col="darkorange2")
polygon(y = c(cds.codons.freqs[codon.age$V1],0,0), x=c(seq(1,64),64,1),col = adjustcolor("darkorange2",alpha.f = 1),border = FALSE)
# IGORF line redrawn over the CDS area (same clean-up of arguments as in panel A)
points(igorf.codons.freqs[codon.age$V1],type="l",col='darkorchid3',pch=20,lwd=2)
# Z-test Significant codons:
segments(x0 = codon.signif.less,x1=codon.signif.less,y1=0.055,y0=igorf.codons.freqs[codon.age$V1][codon.signif.less],lty=3)
segments(x0 = codon.signif.great,x1=codon.signif.great,y1=0.055,y0=cds.codons.freqs[codon.age$V1][codon.signif.great],lty=3)
points(x=codon.signif.less,y=rep(0.06,length(codon.signif.less)),pch="*",col='darkorchid3',cex=1.5)
points(x=codon.signif.great,y=rep(0.06,length(codon.signif.great)),pch="*",col='darkorange3',cex=1.5)

mtext(text = expression(bold("B")),cex = 1,side = 2,las=2,at = 0.08,adj = 5)

dev.off()
# ===================== #
#   END of Figure S5    #
# ===================== #


# ================= #
#    Figure S13     #
# ================= #

            # =========================== #
            #    NUCLEOTIDES RANDOMIZED   #
            # =========================== #
# ------------------------ #
#   Load the AA sequences  #
# ------------------------ #
# Amino acid sequences of the nucleotide-scrambled sets (CDS, ancIGORFs, selectively translated)
cds.randomized.aa         = load.fasta.file(paste0(files.path,"Random_sequences/Scrumbled_nucleotides/Scer_CDS_randomized.pfasta"))
ancestral.randomized.aa   = load.fasta.file(paste0(files.path,"Random_sequences/Scrumbled_nucleotides/Ancestral_randomized.pfasta"))
selectively.randomized.aa = load.fasta.file(paste0(files.path,"Random_sequences/Scrumbled_nucleotides/Selectively_randomized.pfasta"))

# HCA barcode sequences of the same three sets
cds.randomized.barcode         = load.fasta.file(paste0(files.path,"Random_sequences/Scrumbled_nucleotides/Scer_CDS_randomized.barcodes"))
ancestral.randomized.barcode   = load.fasta.file(paste0(files.path,"Random_sequences/Scrumbled_nucleotides/Ancestral_randomized.barcodes"))
selectively.randomized.barcode = load.fasta.file(paste0(files.path,"Random_sequences/Scrumbled_nucleotides/Selectively_randomized.barcodes"))

# Cluster / linker / extremity sizes of every barcode (barcodes corrected first: corr.seq = "yes")
# -- scrambled CDS
cds.randomized.cluster = calculate.cluster.size.list(
  cds.randomized.barcode$Sequence, corr.seq = "yes")
cds.randomized.linker  = calculate.linker.size.list(
  cds.randomized.barcode$Sequence, corr.seq = "yes")
cds.randomized.extreme = calculate.extremities.size.list(
  cds.randomized.barcode$Sequence, corr.seq = "yes")
# -- scrambled ancIGORFs
ancestral.randomized.cluster = calculate.cluster.size.list(
  ancestral.randomized.barcode$Sequence, corr.seq = "yes")
ancestral.randomized.linker  = calculate.linker.size.list(
  ancestral.randomized.barcode$Sequence, corr.seq = "yes")
ancestral.randomized.extreme = calculate.extremities.size.list(
  ancestral.randomized.barcode$Sequence, corr.seq = "yes")
# -- scrambled selectively translated IGORFs
selectively.randomized.cluster = calculate.cluster.size.list(
  selectively.randomized.barcode$Sequence, corr.seq = "yes")
selectively.randomized.linker  = calculate.linker.size.list(
  selectively.randomized.barcode$Sequence, corr.seq = "yes")
selectively.randomized.extreme = calculate.extremities.size.list(
  selectively.randomized.barcode$Sequence, corr.seq = "yes")

            # ================================================== #
            #     IGORF FREQUENCY NUCLEOTIDES - VARIOUS SIZES    #
            # ================================================== #
# Amino acid sequences of the sets generated with IGORF nucleotide frequencies
# at CDS / ancIGORF / selectively translated sizes
cds.size.igorf.freq.aa         = load.fasta.file(paste0(files.path,"Random_sequences/IGORF_NT_freq_various_sizes/CDS-size_IGORF-freq.pfasta"))
ancestral.size.igorf.freq.aa   = load.fasta.file(paste0(files.path,"Random_sequences/IGORF_NT_freq_various_sizes/AncIGORF-size_IGORF-freq.pfasta"))
selectively.size.igorf.freq.aa = load.fasta.file(paste0(files.path,"Random_sequences/IGORF_NT_freq_various_sizes/Selectively-size_IGORF-freq.pfasta"))

# HCA barcode sequences of the same three sets
cds.size.igorf.freq.barcode         = load.fasta.file(paste0(files.path,"Random_sequences/IGORF_NT_freq_various_sizes/CDS-size_IGORF-freq.barcodes"))
ancestral.size.igorf.freq.barcode   = load.fasta.file(paste0(files.path,"Random_sequences/IGORF_NT_freq_various_sizes/AncIGORF-size_IGORF-freq.barcodes"))
selectively.size.igorf.freq.barcode = load.fasta.file(paste0(files.path,"Random_sequences/IGORF_NT_freq_various_sizes/Selectively-size_IGORF-freq.barcodes"))

# Cluster / linker / extremity sizes of every barcode (corr.seq = 'yes')
# -- CDS-sized
cds.size.igorf.freq.cluster = calculate.cluster.size.list(
  cds.size.igorf.freq.barcode$Sequence, corr.seq = 'yes')
cds.size.igorf.freq.linker  = calculate.linker.size.list(
  cds.size.igorf.freq.barcode$Sequence, corr.seq = 'yes')
cds.size.igorf.freq.extreme = calculate.extremities.size.list(
  cds.size.igorf.freq.barcode$Sequence, corr.seq = 'yes')
# -- ancIGORF-sized
ancestral.size.igorf.freq.cluster = calculate.cluster.size.list(
  ancestral.size.igorf.freq.barcode$Sequence, corr.seq = 'yes')
ancestral.size.igorf.freq.linker  = calculate.linker.size.list(
  ancestral.size.igorf.freq.barcode$Sequence, corr.seq = 'yes')
ancestral.size.igorf.freq.extreme = calculate.extremities.size.list(
  ancestral.size.igorf.freq.barcode$Sequence, corr.seq = 'yes')
# -- selectively-translated-sized
selectively.size.igorf.freq.cluster = calculate.cluster.size.list(
  selectively.size.igorf.freq.barcode$Sequence, corr.seq = 'yes')
selectively.size.igorf.freq.linker  = calculate.linker.size.list(
  selectively.size.igorf.freq.barcode$Sequence, corr.seq = 'yes')
selectively.size.igorf.freq.extreme = calculate.extremities.size.list(
  selectively.size.igorf.freq.barcode$Sequence, corr.seq = 'yes')

# ========================= #
#  Prepare data for GGPLOT  #
# ========================= #

      # ============= #
      #  Clusters NB  #
      # ============= #
# ------- #
#   CDS   #
# ------- #
# Helper: stack several groups of sizes into one long table for ggplot.
#   values     : list with one element per group (a numeric vector, or a list of
#                numeric vectors which is flattened in order)
#   labels     : character vector, one category label per group
#   value.name : name given to the first (numeric) column
# Returns a data.frame whose 1st column holds the numeric values and whose
# 2nd column "category" is a factor with levels in order of first appearance.
# The table is built directly with typed columns: the previous
# numeric -> character matrix -> as.numeric() round trip returned factor level
# codes instead of the values whenever stringsAsFactors was TRUE (R < 4.0).
build.size.table = function(values,labels,value.name){
  stopifnot(length(values) == length(labels))
  values     = lapply(values,function(v) as.numeric(unlist(v,use.names = FALSE)))
  category   = rep(labels,times = lengths(values))
  size.table = data.frame(unlist(values,use.names = FALSE),
                          factor(category,levels = unique(category)))
  colnames(size.table) = c(value.name,"category")
  return(size.table)
}

      # ============= #
      #  Clusters NB  #
      # ============= #
# ------- #
#   CDS   #
# ------- #
# Number of clusters per sequence in each category
cds.randomized.nt.cnb = build.size.table(
  values = list(lengths(igorf.cluster.size),
                lengths(cds.size.igorf.freq.cluster),
                lengths(cds.randomized.cluster),
                lengths(cds.cluster.size)),
  labels = c("IGORFs","artificial\nIGORFs","scrambled\nCDS","CDS"),
  value.name = "cluster_nb")

      # ============== #
      # Clusters size  #
      # ============== #
# ------- #
#   CDS   #
# ------- #
# Size of every cluster, all sequences of a category pooled
cds.randomized.nt.c = build.size.table(
  values = list(igorf.cluster.size,
                cds.size.igorf.freq.cluster,
                cds.randomized.cluster,
                cds.cluster.size),
  labels = c("IGORFs","artificial\nIGORFs","scrambled\nCDS","CDS"),
  value.name = "cluster_size")

      # ============== #
      #  Linkers size  #
      # ============== #
# In the three tables below each group pools the linker sizes followed by the
# extremities sizes of the category (the inner list is flattened in that order).
# ------- #
#  CDS    #
# ------- #
cds.randomized.nt.le = build.size.table(
  values = list(list(igorf.linker.size,igorf.extremities.size),
                list(cds.size.igorf.freq.linker,cds.size.igorf.freq.extreme),
                list(cds.randomized.linker,cds.randomized.extreme),
                list(cds.linker.size,cds.extremities.size)),
  labels = c("IGORFs","artificial\nIGORFs","scrambled\nCDS","CDS"),
  value.name = "linker_size")

# -------------- #
#  Selectively  #
# -------------- #
selectively.randomized.nt.le = build.size.table(
  values = list(list(igorf.linker.size,igorf.extremities.size),
                list(selectively.size.igorf.freq.linker,selectively.size.igorf.freq.extreme),
                list(selectively.randomized.linker,selectively.randomized.extreme),
                list(Selectively.translated.linkers,Selectively.translated.extremities)),
  labels = c("IGORFs","artificial\nIGORFs","scrambled\nhighly trans.","highly trans."),
  value.name = "linker_size")

# -------------- #
#    Ancestral   #
# -------------- #
ancestral.randomized.nt.le = build.size.table(
  values = list(list(igorf.linker.size,igorf.extremities.size),
                list(ancestral.size.igorf.freq.linker,ancestral.size.igorf.freq.extreme),
                list(ancestral.randomized.linker,ancestral.randomized.extreme),
                list(ancestral.igorf.linker,ancestral.igorf.extremities)),
  labels = c("IGORFs","artificial\nIGORFs","scrambled\nancIGORFs","ancIGORFs"),
  value.name = "linker_size")


# Fill colours shared by the three panels, in the order of the category levels
s13.fill = c(adjustcolor('darkorchid',alpha.f = 0.7),
             "white",
             adjustcolor('darkorange2',alpha.f = 0.2),
             adjustcolor('darkorange2',alpha.f = 0.7))

# Significance label for the comparison of two categories of a table:
# the 1st column of `tab` is compared between category `ref` (tableA) and
# category `against` (tableB) with the requested test type.
s13.stars = function(tab,ref,against,type){
  asteriscs.multiple.wilcox.test(tableA = tab[which(tab$category == ref),1],
                                 tableB = tab[which(tab$category == against),1],
                                 type = type)
}

# Theme shared by the three panels: no legend, centred title
s13.theme = theme(legend.position="none",plot.title = element_text(hjust = 0.5,size=10))

# Panel A : number of clusters per sequence
pS6A <- ggplot(data = cds.randomized.nt.cnb,aes(x=category, y=cluster_nb,fill=category)) +
  geom_violin(trim=TRUE,color=NA) +
  ylim(0,40) +
  scale_fill_manual(values=s13.fill) +
  geom_boxplot(outlier.size = 0.0,width=0.1,col=c("black"),fill=s13.fill) +
  theme_minimal() + s13.theme +
  ylab('Number of clusters') + xlab("") + ggtitle("Cluster number") +
  # orange labels: comparison with CDS ; purple labels: comparison with IGORFs
  annotate(geom="text",x=3,col='darkorange3',y=30,size=5,
           label= s13.stars(cds.randomized.nt.cnb,"CDS","scrambled\nCDS","l")) +
  annotate(geom="text",x=2,col='darkorange3',y=30,size=5,
           label= s13.stars(cds.randomized.nt.cnb,"CDS","artificial\nIGORFs","l")) +
  annotate(geom="text",x=3,col='darkorchid3',y=28,size=5,
           label= s13.stars(cds.randomized.nt.cnb,"IGORFs","scrambled\nCDS","l")) +
  annotate(geom="text",x=2,col='darkorchid3',y=28,size=5,
           label= s13.stars(cds.randomized.nt.cnb,"IGORFs","artificial\nIGORFs","l"))

# Panel B : cluster sizes
pS6B <- ggplot(data = cds.randomized.nt.c,aes(x=category, y=cluster_size,fill=category)) +
  geom_violin(trim=TRUE) +
  ylim(0,40) +
  scale_fill_manual(values=s13.fill) +
  geom_boxplot(outlier.size = 0.0,width=0.1,col=c("black"),fill=s13.fill) +
  theme_minimal() + s13.theme +
  ylab('Number of residues') + xlab("") + ggtitle("Cluster size") +
  annotate(geom="text",x=3,col='darkorange3',y=30,size=5,
           label= s13.stars(cds.randomized.nt.c,"CDS","scrambled\nCDS","two.sided")) +
  annotate(geom="text",x=2,col='darkorange3',y=30,size=5,
           label= s13.stars(cds.randomized.nt.c,"CDS","artificial\nIGORFs","two.sided")) +
  annotate(geom="text",x=3,col='darkorchid3',y=28,size=5,
           label= s13.stars(cds.randomized.nt.c,"IGORFs","scrambled\nCDS","two.sided")) +
  annotate(geom="text",x=2,col='darkorchid3',y=28,size=5,
           label= s13.stars(cds.randomized.nt.c,"IGORFs","artificial\nIGORFs","two.sided"))

# Panel C : linker sizes (test type "g" against CDS, "l" against IGORFs)
pS6C <- ggplot(data = cds.randomized.nt.le,aes(x=category, y=linker_size,fill=category)) +
  geom_violin(trim=TRUE) +
  ylim(0,40) +
  scale_fill_manual(values=s13.fill) +
  geom_boxplot(outlier.size = 0.0,width=0.1,col=c("black"),fill=s13.fill) +
  theme_minimal() + s13.theme +
  ylab('Number of residues') + xlab("") + ggtitle("Linker size") +
  annotate(geom="text",x=3,col='darkorange3',y=30,size=5,
           label= s13.stars(cds.randomized.nt.le,"CDS","scrambled\nCDS","g")) +
  annotate(geom="text",x=2,col='darkorange3',y=30,size=5,
           label= s13.stars(cds.randomized.nt.le,"CDS","artificial\nIGORFs","g")) +
  annotate(geom="text",x=3,col='darkorchid3',y=28,size=5,
           label= s13.stars(cds.randomized.nt.le,"IGORFs","scrambled\nCDS","l")) +
  annotate(geom="text",x=2,col='darkorchid3',y=28,size=5,
           label= s13.stars(cds.randomized.nt.le,"IGORFs","artificial\nIGORFs","l"))

# Write Supplemental Figure S13 (panels A, B, C side by side) to a PDF file
pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S13.pdf'),height = 3.5,width = 9)
# The arranged figure is printed explicitly: grid-based figures are only drawn
# when printed, and top-level auto-printing does not happen when the script is
# run through source(), which would otherwise produce an empty PDF.
print(
  annotate_figure(
    ggarrange(pS6A,pS6B,pS6C,labels="AUTO",ncol=3,nrow = 1,font.label = list(size=12,face = "bold"))
    #,bottom = text_grob("Supplemental Figure 6", color = "black",hjust = 1.2, x = 1, face = "plain", size = 10)
  )
)
dev.off()
# ====================== #
#   END of Figure S13    #
# ====================== #

# ================= #
#    Figure S14     #
# ================= #
# Residue frequencies (type="All") of the scrambled CDS and of the CDS-sized
# sequences generated with IGORF nucleotide frequencies
cds.randomized.freqs.residues      = residues_frequencies(cds.randomized.aa$Sequence,cds.randomized.barcode$Sequence,type="All")
cds.size.igorf.freq.freqs.residues = residues_frequencies(cds.size.igorf.freq.aa$Sequence,cds.size.igorf.freq.barcode$Sequence,type="All")
library(fmsb)
# Input matrix for radarchart(): with maxmin = T the first two rows give the
# maximum (0.12) and minimum (0) of every axis, the following rows are the
# series to draw (CDS, scrambled CDS, artificial IGORFs -- same order as the legend)
freqs.mat.cluster = rbind(rep(x = 0.12,20),rep(x=0,20),
                          CDS.freqs.residues,
                          cds.randomized.freqs.residues,
                          cds.size.igorf.freq.freqs.residues
)
colnames(freqs.mat.cluster) = names(CDS.freqs.residues)

pdf(paste(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S14.pdf',sep=""),height = 4,width = 5)
par(mar=c(2,0,3,0))
# pcol / plty / plwd / pfcol are given per series: the 1st series (CDS) has no
# line (plty = 0) and a light orange fill, the two others are drawn as lines
# only (their fill colours have alpha.f = 0)
radarchart(as.data.frame(freqs.mat.cluster),seg = 2,maxmin = T,axistype = 1,
           pcol = c(adjustcolor("darkorange2",alpha.f = 0.2),"darkorange2","black"),
           cglcol = "grey",caxislabels = seq(0,0.12,0.06),axislabcol = "grey",cglty = 3,pty = c(32,32,32),calcex=0.8,plwd=c(1,2,2),
           title = "Amino acids frequencies",plty=c(0,1,1),cex.main=0.8,pfcol = c(adjustcolor("darkorange2",alpha.f = 0.2),adjustcolor("royalblue",alpha.f = 0),adjustcolor("royalblue",alpha.f = 0))
           #pfcol = adjustcolor(c("darkorange2","darkorchid2"),alpha.f = 0.2)
)
# Legend entries follow the order of the series rows of freqs.mat.cluster
legend(x=0.8,y=-1,inset=c(-0.2,0),xpd=TRUE,legend = c("CDS","Scrambled CDS","Artificial IGORFs"),
       fill = c(adjustcolor("darkorange2",alpha.f = 0.2),adjustcolor("royalblue",alpha.f = 0),adjustcolor("royalblue",alpha.f = 0)),
       border = adjustcolor("royalblue",alpha.f = 0),lwd=2,
       col = c(adjustcolor("royalblue",alpha.f = 0),"darkorange2","black"),pch=c(-1,-1,-1),
       bg=F,box.lty = 0,box.lwd = 0,cex = 0.7,lty=c(1,1,1)
)
#mtext(text = "Supplemental Figure 14",cex = 0.8,side=3,line=2,las=1,adj=0.9)
dev.off()
# ====================== #
#   END of Figure S14    #
# ====================== #

# ================= #
#    Figure S12     #
# ================= #
# Supplemental Figure S12: median cluster size (left panel) and median linker
# size (right panel) of the random sequence sets, one point per sequence-size
# group and one curve per hydrophobicity level (file suffixes 10 ... 90).
#
# Changes with respect to the previous version:
#   * each point is now the median of ITS OWN size group. The former code took
#     median(unlist(<list of all groups loaded so far>)), i.e. a running pooled
#     median, so the point of group k mixed groups 1..k;
#   * the hydrophobicity levels are iterated with an integer index, so the
#     colour lookup no longer relies on truncating a floating point value
#     (cols[i*10] with i taken from seq(0.1,0.9,0.1));
#   * the 25%/75% quantile vectors, which were computed but never drawn, are gone;
#   * the duplicated colour-key code lives in one helper.
pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S12.pdf'),height = 4,width = 8)
library("colorspace")
par(mfrow=c(1,2))
cols         = diverge_hcl(n = 9,"Blue-Red")
lower.limits = c(20,30,50,70,100,150,200)
upper.limits = c(30,50,70,100,150,200,1000)
size.labels  = paste(lower.limits,upper.limits,sep="-")

# Load the barcode file of one (size group, hydrophobicity level) combination.
# `group` indexes lower.limits/upper.limits, `hydro.index` runs from 1 to 9.
s12.load.barcodes = function(group,hydro.index){
  load.fasta.file(paste0(files.path,"Random_sequences/Size_&_Hydrophobicity/","Sequences_",
                         lower.limits[group],"-",upper.limits[group],"_",hydro.index*10,".barcodes"))
}

# Overlay the 9-colour hydrophobicity key on top of the current panel.
# par(new=TRUE) keeps the panel that was just drawn; the large bottom margin
# squeezes the image into a thin strip near the top of that panel.
s12.draw.colour.key = function(){
  par(new=TRUE)
  par(mar=c(15.5,5,4,5))
  image(x = seq(1,9),y=1,z=t(matrix(nrow = 1,ncol = 9,data = seq(1,9))),col = diverge_hcl(n = 9,"Blue-Red"),yaxt="n",ylab="",xaxt="n",xlab="")
  axis(side = 1,at = seq(1,9),labels = as.character(seq(0.1,0.9,0.1)),cex.axis=0.5,tick = FALSE,padj = -6.5)
}

# ---------------- Cluster size ----------------
par(mar=c(5,4,4,3) + 0.1)
plot(0,-10,ylim=c(0,60),xlim=c(0,8),frame=FALSE,xlab="Size Group (residues)",ylab="Number of residues",las=1,xaxt='n',main="Cluster size",cex.main=0.8,cex.lab = 0.8,cex.axis=0.7)
axis(side = 1,at = seq(1,7),labels = size.labels,las=2,cex.axis=0.5)
for (hydro.index in seq_along(cols)){
  cluster.medians = rep(NA_real_,length(lower.limits))
  for (count in seq_along(lower.limits)){
    barcodes = s12.load.barcodes(count,hydro.index)
    # median over the clusters of this size group only
    cluster.medians[count] = median(unlist(calculate.cluster.size.list(barcodes$Sequence,corr.seq = 'yes')))
  }
  points(seq(1,7),cluster.medians,col=adjustcolor(cols[hydro.index],alpha.f = 1),pch=20,type = "b",cex=0.4,lwd=2)
}
text(x=4,y=57,labels = "hydrophobicity content",cex = 0.7)
s12.draw.colour.key()

# ---------------- Linker size ----------------
par(mar=c(5,4,4,3) + 0.1)
plot(0,-10,ylim=c(0,60),xlim=c(0,8),frame=FALSE,xlab="Size Group (residues)",ylab="Number of residues",las=1,xaxt='n',main="Linker size",cex.main=0.8,cex.lab = 0.8,cex.axis=0.7)
axis(side = 1,at = seq(1,7),labels = size.labels,las=2,cex.axis=0.5)
for (hydro.index in seq_along(cols)){
  linker.medians = rep(NA_real_,length(lower.limits))
  for (count in seq_along(lower.limits)){
    barcodes = s12.load.barcodes(count,hydro.index)
    # internal linkers and sequence extremities are pooled, as before,
    # but only within the current size group
    group.linkers = c(unlist(calculate.extremities.size.list(barcodes$Sequence,corr.seq = 'yes')),
                      unlist(calculate.linker.size.list(barcodes$Sequence,corr.seq = 'yes')))
    linker.medians[count] = median(group.linkers)
  }
  points(seq(1,7),linker.medians,col=adjustcolor(cols[hydro.index],alpha.f = 1),pch=20,type = "b",cex=0.4,lwd=2)
}
text(x=4,y=57,labels = "hydrophobicity content",cex = 0.7)
#mtext(text = "Supplemental Figure 12",cex = 0.8,side=3,las=1,at=7,line=3)
s12.draw.colour.key()

dev.off()
# ====================== #
#   END of Figure S12    #
# ====================== #

# ================ #
#    Figure S6     #
# ================ #
#   IGORFs encompass the large spectrum of fold potential of canonical proteins (raw data)
#   (a) Histograms of the HCA scores of the three reference datasets (i.e. disordered regions, globular domains and transmembrane regions - green, black and pink histograms respectively). 
#   Dotted black lines delineate the boundaries of the low, intermediate and high HCA score categories. 
#   The boundaries are defined so that 95% of globular domains fall into the intermediate HCA score category whereas the low and high HCA score categories include all sequences with HCA values that are lower or higher than those of 97.5% of globular domains respectively. 
#   (b) Histograms of the HCA scores of CDS and IGORFs. The percentages of sequences in each category are given for all datasets.
# Supplemental Figure S6, two stacked panels sharing the same axes:
#   A - HCA score histograms of the three reference sets (disordered,
#       transmembrane, globular) with hard-coded category percentages;
#   B - HCA score histograms of CDS and IGORFs with percentages computed from
#       info.cds / info.igorf and the global lower.limit / higher.limit.
#
# Fix: the two "High HCA" labels of panel B used paste(..., '%') without
# sep="" and were rendered as "12.3 %", unlike the four other labels
# ("12.3%"). All six labels now go through one helper built on paste0().

# Percentage label ("12.3%") of the TRUE entries of a logical vector.
hca.percent.label = function(in.category){
  paste0(round(sum(in.category) / length(in.category) * 100,1),'%')
}

# Elements common to both panels: y axis, dotted category boundaries and the
# three category headings.
hca.draw.category.frame = function(){
  axis(side = 2,labels = c('0','0.1','0.2','0.3'),at=c(0,0.1,0.2,0.3),las=2,cex.axis=0.7)
  segments(x0 = lower.limit,y0=0,x1= lower.limit, y1=0.31,lty=3)
  segments(x0 = higher.limit,y0=0,x1= higher.limit, y1=0.31,lty=3)
  text(x=1,y=0.27,labels ='Intermediate HCA',cex=0.8)
  text(x=8.3,y=0.27,labels = 'High HCA',cex=0.8)
  text(x=-7,y=0.27,labels = 'Low HCA',cex=0.8)
}

pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S6.pdf'),height = 6,width = 4.5)
par(mfrow=c(2,1))
par(mar=c(4.2,3,2,2.5))

# ---------------- Panel A: reference datasets ----------------
hist(info.disorder$HCA,freq = FALSE,cex.axis=0.8,
     col=adjustcolor('forestgreen',alpha.f = 0.2),
     border = FALSE,ylim=c(0,0.40),xlim=c(-10,10),lwd=2,main='',
     xlab = 'HCA score',cex.lab=0.8,yaxt='n')
hist(info.transmembrane$HCA,freq = FALSE,breaks=20,
     col=adjustcolor('mediumvioletred',alpha.f = 0.2),border = FALSE,add=TRUE)
hist(info.globular$HCA,freq = FALSE,breaks=20,
     col=adjustcolor('black',alpha.f = 0.2),border = FALSE,add=TRUE)

legend(x=-3.5,y=0.43,
       c('Globular Regions','Disordered Regions','Transmembrane Regions'),
       fill =c(adjustcolor(c('black','forestgreen','mediumvioletred'),alpha.f = 0.3)),
       lty = c(0),
       border = 0,
       bg=FALSE,
       box.lty = 0,
       box.lwd = 0,
       cex=0.7,horiz = FALSE,ncol = 1)
hca.draw.category.frame()

# NOTE(review): the percentages of panel A are literals, not computed from
# info.globular / info.disorder / info.transmembrane - update them by hand if
# the reference sets or the limits change.
text(x=1,y=0.15,labels = ('95%'),col='black',cex=0.8)
text(x=1,y=0.11,labels = ('2.5%'),col='forestgreen',cex=0.8)
text(x=1,y=0.07,labels = ('29.9%'),col='mediumvioletred',cex=0.8)
text(x=8.3,y=0.15,labels = ('2.5%'),col='black',cex=0.8)
text(x=8.3,y=0.11,labels = ('0%'),col='forestgreen',cex=0.8)
text(x=8.3,y=0.07,labels = ('69.2%'),col='mediumvioletred',cex=0.8)
text(x=-7,y=0.15,labels = ('2.5%'),col='black',cex=0.8)
text(x=-7,y=0.11,labels = ('97.5%'),col='forestgreen',cex=0.8)
text(x=-7,y=0.07,labels = ('0.9%'),col='mediumvioletred',cex=0.8)
mtext(text = expression(bold("A")),cex = 0.9,side = 2,las=2,at = 0.45,adj = 4)

#mtext(text = "Supplemental Figure 6",cex = 0.8,side=3,las=1,line=1,at=9)

# ---------------- Panel B: CDS and IGORFs ----------------
hist(info.cds$HCA_score,freq = FALSE,cex.axis=0.8,
     col=adjustcolor('darkorange2',alpha.f = 0.2),
     border = FALSE,ylim=c(0,0.40),xlim=c(-10,10),lwd=2,main='',
     xlab = 'HCA score',cex.lab=0.8,frame.plot=FALSE,yaxt='n')
hist(info.igorf$HCA_score,freq = FALSE,breaks=20,
     col=adjustcolor('darkorchid3',alpha.f = 0.2),border = FALSE,add=TRUE)
legend(x=-3.5,y=0.43,
       c('CDS','IGORFs'),
       fill =c(adjustcolor(c('darkorange2','darkorchid3'),alpha.f = 0.3)),
       lty = c(0),
       border = 0,
       bg=FALSE,
       box.lty = 0,
       box.lwd = 0,
       cex=0.7,horiz = FALSE,ncol = 1)
hca.draw.category.frame()

# intermediate: lower.limit <= score <= higher.limit (both bounds included)
text(x=1,y=0.15,labels = hca.percent.label(info.cds$HCA_score <= higher.limit & info.cds$HCA_score >= lower.limit),col='darkorange2',cex=0.8)
text(x=1,y=0.11,labels = hca.percent.label(info.igorf$HCA_score <= higher.limit & info.igorf$HCA_score >= lower.limit),col='darkorchid3',cex=0.8)
text(x=8.3,y=0.15,labels = hca.percent.label(info.cds$HCA_score > higher.limit),col='darkorange2',cex=0.8)
text(x=8.3,y=0.11,labels = hca.percent.label(info.igorf$HCA_score > higher.limit),col='darkorchid3',cex=0.8)
text(x=-7,y=0.15,labels = hca.percent.label(info.cds$HCA_score < lower.limit),col='darkorange2',cex=0.8)
text(x=-7,y=0.11,labels = hca.percent.label(info.igorf$HCA_score < lower.limit),col='darkorchid3',cex=0.8)
mtext(text = expression(bold("B")),cex = 0.9,side = 2,las=2,at = 0.45,adj = 4)
dev.off()
# ===================== #
#   END of Figure S6    #
# ===================== #



# ================ #
#     Figure S9    #
# ================ #
#   (a) Boxplot comparing the sequence size of multiple and single ancIGORF de novo genes. 
#   (b) Boxplot comparing the sequence size of ancIGORFs preceding the emergence of single and multiple ancIGORF de novo genes. 
# Supplemental Figure S9: two side-by-side boxplots contrasting "single" and
# "multiple" entries of ancestral.igorf$gene_type.
#   left  - protein size (info.cds$prot_size) of the corresponding genes,
#           each gene counted once;
#   right - length of the ancIGORF amino-acid sequences.
s9.single.rows   <- which(ancestral.igorf$gene_type == "single")
s9.multiple.rows <- which(ancestral.igorf$gene_type == "multiple")

# Protein sizes of the distinct genes found on the given ancestral.igorf rows.
s9.gene.sizes <- function(rows) {
  genes <- unique(ancestral.igorf$gene_name[rows])
  info.cds$prot_size[match(genes, info.cds$Gene)]
}

pdf(paste0(files.path, 'Script_and_data_for_figures/outputs/Supplemental_Fig_S9.pdf'),
    height = 3.5, width = 5.5)
par(mfrow = c(1, 2))

boxplot(
  s9.gene.sizes(s9.single.rows),
  s9.gene.sizes(s9.multiple.rows),
  names = c("Single", "Multiple"),
  #main = expression('Size of'~italic('de novo')~ 'genes'),
  main = expression('Size of de novo genes'),
  ylab = 'number of residues',
  ylim = c(0, 210),
  col = c(adjustcolor("black", alpha.f = 0.5)),
  frame = FALSE, las = 2, pch = 20,
  cex.main = 0.7, cex.lab = 0.8, cex.axis = 0.8, cex.names = 0.7
)

boxplot(
  nchar(ancestral.igorf$ancIGORF_aa_sequence[s9.single.rows]),
  nchar(ancestral.igorf$ancIGORF_aa_sequence[s9.multiple.rows]),
  names = c("Single", "Multiple"),
  #main = expression(atop('Size of ancIGORFs','per type of'~italic('de novo')~'gene')),
  main = expression(atop('Size of ancIGORFs', 'per type of de novo gene')),
  ylab = 'number of residues',
  ylim = c(0, 210),
  col = c(adjustcolor("black", alpha.f = 0.5)),
  frame = FALSE, las = 2, pch = 20,
  cex.main = 0.7, cex.lab = 0.8, cex.axis = 0.8, cex.names = 0.7
)
#mtext(text = "Supplemental Figure 9",cex = 0.7,side=3,las=1,line=3.2,adj=1.9)
dev.off()
# ===================== #
#   END of Figure S9    #
# ===================== #

# ================= #
#    Figure S10     #
# ================= #
# Supplemental Figure S10: frequency of each residue in the *.starts_aa
# vectors of the two IGORF classes, compared with the matching *.random_aa
# vectors, plus the Gini index of each observed distribution.
#
# starts_both holds raw counts, one row per residue symbol:
#   column 1 = Occasionally.starts_aa    column 2 = Selectively.starts_aa
#   column 3 = Selectively.random_aa     column 4 = Occasionally.random_aa
#
# Fixes with respect to the previous version:
#   * the stop symbol "*" is dropped with a logical mask. The former
#     starts_both[-which(rownames(starts_both) == "*"),] removed EVERY row
#     when no "*" row was present (which() returns integer(0));
#   * the matrix size and the x positions follow the data instead of the
#     hard-coded 21 symbols / 20 residues.
occ.start.counts = table(Occasionally.starts_aa)
starts_both = matrix(0,nrow = length(occ.start.counts),ncol = 4,
                     dimnames = list(names(occ.start.counts),NULL))
starts_both[,1] = as.vector(occ.start.counts)
# Rows are defined by the symbols seen in Occasionally.starts_aa; a symbol of
# another table that is missing from it yields an NA row index and an error.
starts_both[match(names(table(Selectively.starts_aa)),rownames(starts_both)),2] = table(Selectively.starts_aa)
starts_both[match(names(table(Selectively.random_aa)),rownames(starts_both)),3] = table(Selectively.random_aa)
starts_both[match(names(table(Occasionally.random_aa)),rownames(starts_both)),4] = table(Occasionally.random_aa)
# order the residues by decreasing count in column 2 (Selectively.starts_aa)
starts_both = starts_both[names(sort(starts_both[,2],decreasing = TRUE)),]
starts_both = starts_both[rownames(starts_both) != "*",,drop = FALSE]

# relative frequencies and x positions used by every drawing call below
n.res           = nrow(starts_both)
res.pos         = seq_len(n.res)
occ.freq        = starts_both[,1]/sum(starts_both[,1])
sele.freq       = starts_both[,2]/sum(starts_both[,2])
sele.rand.freq  = starts_both[,3]/sum(starts_both[,3])
occ.rand.freq   = starts_both[,4]/sum(starts_both[,4])

pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S10.pdf'),width = 5,height = 4)
library("ineq")
plot(y=sele.freq,frame=FALSE,las=2,x=res.pos,type="p",xaxt="n",pch=20,lwd=2,col="red",ylim=c(0,0.6),xlab="Amino acids",ylab="Frequency",cex.lab=0.8,cex.axis=0.8,cex.main=0.8)
axis(side = 1,at = res.pos,labels =rownames(starts_both),cex.axis=0.7)
# thin lines: frequencies of the two *.random_aa vectors
points(y=sele.rand.freq,x=res.pos,col=adjustcolor("red",alpha.f = 0.7),type="l")
points(y=occ.rand.freq,x=res.pos,col=adjustcolor("goldenrod1",alpha.f = 0.7),type="l")
# lollipops: observed frequencies of the two *.starts_aa vectors
segments(x0 = res.pos,x1 = res.pos,y0=rep(0,n.res),y1=sele.freq,col="red")
segments(x0 = res.pos,x1 = res.pos,y0=rep(0,n.res),y1=occ.freq,col="goldenrod1")
points(y=sele.freq,x=res.pos,type="p",col="red",lwd=2,pch=20)
points(y=occ.freq,x=res.pos,type="p",col="goldenrod1",lwd=2,pch=20)

legend("topright",legend = c("Highly translated","Occasionally translated"),
       col = adjustcolor(c("red","goldenrod1"),alpha.f = 1),pch=c(20,20),pt.cex = 1.2,
       border = c("red","goldenrod1"),box.lwd = 0,box.col = 0,box.lty = 0,x.intersp = c(1,1))

text(x = 14,y=0.3,adj = 0,labels = paste("Gini Index:",round(ineq(sele.freq,type = "Gini"),2)),col="red")
text(x = 14,y=0.25,adj = 0,labels = paste("Gini Index:",round(ineq(occ.freq,type = "Gini"),2)),col="goldenrod1")

sele.prop.pvals = multiple.prop.test.aa(table1 = Selectively.starts_aa,table2=Selectively.random_aa,type="g")
occ.prop.pvals  = multiple.prop.test.aa(table1 = Occasionally.starts_aa,table2=Occasionally.random_aa,type="g")

# Stars above the residues with p < 0.05.
# NOTE(review): the positions of the p-values are used directly as x
# coordinates, which assumes multiple.prop.test.aa() returns them in the same
# residue order as the (sorted) rows of starts_both - confirm.
points(x=which(occ.prop.pvals < 0.05),
       y=occ.freq[which(occ.prop.pvals < 0.05)]+0.03,
       pch="*",cex=2,col="goldenrod1")
points(x=which(sele.prop.pvals < 0.05),
       y=sele.freq[which(sele.prop.pvals < 0.05)]+0.03,
       pch="*",cex=2,col="red")
#mtext(text = "Supplemental Figure 10",cex = 0.8,side=3,las=1,line=2,adj=1)

dev.off()
# ====================== #
#   END of Figure S10    #
# ====================== #

# ================= #
#    Figure S11     #
# ================= #

# Supplemental Figure S11: violin + box plots of the linker sizes per sequence
# category (panel A: ancestral.randomized.nt.le, panel B:
# selectively.randomized.nt.le), annotated with the significance marks
# returned by asteriscs.multiple.wilcox.test().
#
# Fix: the arranged figure is now print()ed explicitly. It was previously only
# drawn through top-level auto-printing, which does not happen when the script
# is run with source() (the PDF was then left empty).

# Violin/boxplot layers shared by both panels. `fills` gives the four fill
# colours, used both for the violins and for the boxplots.
s11.linker.violin = function(tab,fills){
  ggplot(data = tab,aes(x=category, y=linker_size,fill=category))+geom_violin(trim=TRUE)+ylim(0,40) +
    scale_fill_manual(values=fills)+
    geom_boxplot(outlier.size = 0.0,width=0.1,col=c('black'),fill=fills) +
    theme_minimal()+ theme(legend.position="none",plot.title = element_text(hjust = 0.5,size=10,face="bold"),axis.title=element_text(size=9,face="plain")) +
    ylab('Number of residues') + xlab("") + ggtitle("Linker size")
}

# Significance mark for the comparison of two categories of `tab`. The tested
# values are taken from the FIRST column of `tab`, as in the original code.
s11.stars = function(tab,catA,catB,type){
  asteriscs.multiple.wilcox.test(tableA = tab[which(tab$category == catA),1],
                                 tableB = tab[which(tab$category == catB),1],
                                 type = type)
}

s11.fills.A = c(adjustcolor('darkorchid3',alpha.f = 0.7),"white",adjustcolor('grey56',alpha.f = 0.2),adjustcolor('grey56',alpha.f = 0.7))
s11.fills.B = c(adjustcolor('darkorchid3',alpha.f = 0.7),"white",adjustcolor('red',alpha.f = 0.2),adjustcolor('red',alpha.f = 0.7))

# NOTE(review): the "IGORFs" vs "artificial\nIGORFs" comparison uses type "g"
# in panel A but type "l" in panel B - confirm which one is intended.
pS14A <- s11.linker.violin(ancestral.randomized.nt.le,s11.fills.A) +
  annotate(geom="text",x=3, y=30,size=7,col='grey56',label= s11.stars(ancestral.randomized.nt.le,"ancIGORFs","scrambled\nancIGORFs","g")) +
  annotate(geom="text",x=2, y=30,size=7,col='grey56',label= s11.stars(ancestral.randomized.nt.le,"ancIGORFs","artificial\nIGORFs","g")) +
  annotate(geom="text",x=3, y=35,size=7,col='darkorchid3',label= s11.stars(ancestral.randomized.nt.le,"IGORFs","scrambled\nancIGORFs","l")) +
  annotate(geom="text",x=2, y=35,size=7,col='darkorchid3',label= s11.stars(ancestral.randomized.nt.le,"IGORFs","artificial\nIGORFs","g"))

pS14B <- s11.linker.violin(selectively.randomized.nt.le,s11.fills.B) +
  annotate(geom="text",x=3, y=30,size=7,col='red',label= s11.stars(selectively.randomized.nt.le,"highly trans.","scrambled\nhighly trans.","g")) +
  annotate(geom="text",x=2, y=30,size=7,col='red',label= s11.stars(selectively.randomized.nt.le,"highly trans.","artificial\nIGORFs","g")) +
  annotate(geom="text",x=3, y=35,size=7,col='darkorchid3',label= s11.stars(selectively.randomized.nt.le,"IGORFs","scrambled\nhighly trans.","l")) +
  annotate(geom="text",x=2, y=35,size=7,col='darkorchid3',label= s11.stars(selectively.randomized.nt.le,"IGORFs","artificial\nIGORFs","l"))

pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S11.pdf'),height = 5,width = 4)
print(
  annotate_figure(
    ggarrange(pS14A,pS14B,labels="AUTO",ncol=1,nrow = 2,font.label = list(size=12,face = "bold"))
    #,bottom = text_grob("Supplemental Figure 11", color = "black",hjust = 1.2, x = 1, face = "plain", size = 10)
  )
)
dev.off()
# ====================== #
#   END of Figure S11    #
# ====================== #

# ================= #
#    Figure S15     #
# ================= #
#   Lowly abundant proteins display a large spectrum of aggregation propensities.
#   Protein abundances (in PPM) of all cytoplasmic proteins are plotted against their 
#   corresponding aggregation propensity calculated with TANGO. 
#   The Spearman rank correlation coefficient is -0.30 with p-value < 2.2e-16.
# Supplemental Figure S15: scatter plot of abundancy$V2 (y axis, labelled as
# log10 abundance in ppm) against the aggregation propensity of the same genes
# (x axis), with 2D density contours and the Spearman correlation.
#
# Changes with respect to the previous version:
#   * the p-value label reads "P < 2.2x10^-16" (it said "P ="), in line with
#     the figure description, which reports p-value < 2.2e-16;
#   * the gene lookup is done once instead of three times;
#   * kde2d() only receives the finite (x, y) pairs: it stops on missing or
#     infinite values, whereas plot() and cor.test() simply ignore them.
pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S15.pdf'),height = 4,width = 4)
# aggregation propensity of each gene listed in abundancy (NA if not in info.cds)
s15.aggregation = info.cds$aggregation_propensity[match(abundancy$V1,info.cds$Gene)]
s15.finite      = is.finite(s15.aggregation) & is.finite(abundancy$V2)

plot(s15.aggregation,abundancy$V2,
     pch=20,cex=0.5,frame =FALSE,col='grey',xlim=c(0,0.6),yaxt="n",cex.main=0.8,cex.lab=0.8,cex.axis=0.8,
     ylab='Log10(Abundancy) (ppm)',xlab='Aggregation propensity')
axis(side = 2,at = seq(-2,4),labels = c("-2","-1","0","1","2","3","4"),cex.axis=0.8,las=2)
contouring = kde2d(s15.aggregation[s15.finite],abundancy$V2[s15.finite],n = 500)
contour(contouring, drawlabels=FALSE, nlevels=4, col='black', add=TRUE,lwd = 2)
text(x=0.35,y=3.5,pos = 4,
     labels = paste0("Rho = ",round(cor.test(s15.aggregation,abundancy$V2,
                                             method = "spearman",exact = FALSE)$estimate,2)))
#mtext(text = "Supplemental Figure 15",cex = 0.8,side=3,line=1,las=1,adj=1)
# NOTE(review): the p-value shown is a literal, not read from cor.test().
text(x=0.35,y=2.8,pos = 4,labels = expression("P < 2.2x10"^-16))
dev.off()
# ====================== #
#   END of Figure S15    #
# ====================== #

# ================= #
#    Figure S17     #
# ================= #
#   Quality control for the 28-mer RPFs used for the detection of occasionally and selectively translated IGORFs for all 
#   five experiments. The left panel shows that 90% (in average) of the 28-mer RPFs are in frame with the start codon of 
#   the CDS (Frame 0). The right panel presents the number of RPFs at each nucleotide position (determined by the site P 
#   of each 28-mer) showing accumulation of signal over the CDS (reads detected only after the start codon), and a nice 
#   periodicity (of frame 0) over the 100 first nucleotides. Both these results inform us about the good quality of the 
#   RPF data in all five experiments.

# Supplemental Figure S17: one row per Ribo-Seq experiment (R1..R5), with the
# CDS phasing densities on the left and the per-position read counts on the
# right.
#
# The five experiments were previously drawn by five copy-pasted stanzas that
# differed only in the replicate tag and the GSM identifier. The drawing code
# now lives in the helpers below; every global object created by the former
# version (R*.reads, CDS.ribo, R*.cds.reads, R*.cds.reads.for.ploting) is
# still created, with the same content.

# Per-CDS fraction of reads falling in each of the three frames.
# `frame.counts`: table whose first three columns are the frame 0/1/2 counts.
# Returns a 3-column matrix; rows with an undefined fraction (e.g. no read at
# all, 0/0) are dropped.
s17.frame.fractions = function(frame.counts){
  totals    = rowSums(frame.counts[,1:3])
  fractions = cbind(frame.counts[,1]/totals,
                    frame.counts[,2]/totals,
                    frame.counts[,3]/totals)
  fractions[complete.cases(fractions),]
}

# Borderless "Frame 0/1/2" legend shared by both panel types.
s17.frame.legend = function(position){
  legend(position,
         c('Frame 0','Frame 1','Frame 2'),
         fill =c(adjustcolor(c('seagreen4','blue','red'),alpha.f = 0.3)),
         lty = c(0),
         border = 0,
         bg=FALSE,
         box.lty = 0,
         box.lwd = 0,
         cex=1,horiz = FALSE,ncol = 1)
}

# Left panel: density of the per-CDS read fraction, one polygon per frame.
s17.draw.phasing = function(fractions,sample.id){
  # the first curve is drawn fully transparent: it only sets up the axes
  plot(density(fractions[,1],bw = 0.05),col=adjustcolor("white",alpha.f = 0),
       ylim=c(0,10),xaxt="n",frame=FALSE,lwd=2,
       main=paste0(sample.id," - CDS phasing"),xlab="Percentage of reads (%)")
  polygon(density(fractions[,1],bw = 0.05),col=adjustcolor("seagreen4",alpha.f = 0.3),border = FALSE)
  axis(side=1,at = seq(0,1,0.2),labels = seq(0,1,0.2)*100)
  polygon(density(fractions[,2],bw = 0.05),col=adjustcolor("blue",alpha.f = 0.3),border = FALSE)
  polygon(density(fractions[,3],bw = 0.05),col=adjustcolor("red",alpha.f = 0.3),border = FALSE)
  s17.frame.legend("top")
}

# Right panel: grouped barplot of rows 12 to 62 of `position.counts`, one bar
# per column of the table. The x tick labels are fixed literals, "M" marking
# the start codon.
s17.draw.periodicity = function(position.counts,sample.id){
  window.counts = t(position.counts[12:62,])
  barplot(window.counts,beside = TRUE,
          col=adjustcolor(c("seagreen4","blue","red"),alpha.f = 0.5),
          xaxt="n",yaxt="n",border = FALSE,
          ylab = "Number of Reads",xlab = "Genomic positions",
          main = paste0(sample.id," - CDS periodicity"))
  axis(side = 1,
       at = c(2.5,22.5,42.5,62.5,82.5,102.5,122.5,142.5,162.5,182.5,202.5),
       labels =c("-44","-29","-14","M","17","32","47","62","77","92","107")
  )
  # plain (non scientific) notation for the read counts
  y.ticks = pretty(window.counts)
  axis(side = 2,at=y.ticks,labels = format(y.ticks,scientific=FALSE),cex.axis=0.7,las=2)
  s17.frame.legend("topleft")
}

R1.reads  = read.table(paste0(files.path,"Script_and_data_for_figures/inputs/RiboSeq_periodicity_tabs/R1_counts_mapping.tab"))
R2.reads  = read.table(paste0(files.path,"Script_and_data_for_figures/inputs/RiboSeq_periodicity_tabs/R2_counts_mapping.tab"))
R3.reads  = read.table(paste0(files.path,"Script_and_data_for_figures/inputs/RiboSeq_periodicity_tabs/R3_counts_mapping.tab"))
R4.reads  = read.table(paste0(files.path,"Script_and_data_for_figures/inputs/RiboSeq_periodicity_tabs/R4_counts_mapping.tab"))
R5.reads  = read.table(paste0(files.path,"Script_and_data_for_figures/inputs/RiboSeq_periodicity_tabs/R5_counts_mapping.tab"))
CDS.ribo  = read.table(paste0(files.path,'Script_and_data_for_figures/inputs/RiboSeq_Reads_tables/Scer_transcriptome_genes_riboreplicas_2020-12-10.tab'),header = TRUE)

pdf(paste0(files.path,'Script_and_data_for_figures/outputs/Supplemental_Fig_S17.pdf'),height = 12,width = 8)
par(mfrow=c(5,2))

# ---- R1 : GSM2147982 ----
R1.cds.reads = CDS.ribo[,c("covframe0_R1","covframe1_R1","covframe2_R1",'ID')]
R1.cds.reads.for.ploting = s17.frame.fractions(R1.cds.reads)
s17.draw.phasing(R1.cds.reads.for.ploting,"GSM2147982")
s17.draw.periodicity(R1.reads,"GSM2147982")

# ---- R2 : GSM2147983 ----
R2.cds.reads = CDS.ribo[,c("covframe0_R2","covframe1_R2","covframe2_R2",'ID')]
R2.cds.reads.for.ploting = s17.frame.fractions(R2.cds.reads)
s17.draw.phasing(R2.cds.reads.for.ploting,"GSM2147983")
s17.draw.periodicity(R2.reads,"GSM2147983")

# ---- R3 : GSM5282046 ----
R3.cds.reads = CDS.ribo[,c("covframe0_R3","covframe1_R3","covframe2_R3",'ID')]
R3.cds.reads.for.ploting = s17.frame.fractions(R3.cds.reads)
s17.draw.phasing(R3.cds.reads.for.ploting,"GSM5282046")
s17.draw.periodicity(R3.reads,"GSM5282046")

# ---- R4 : GSM5282047 ----
R4.cds.reads = CDS.ribo[,c("covframe0_R4","covframe1_R4","covframe2_R4",'ID')]
R4.cds.reads.for.ploting = s17.frame.fractions(R4.cds.reads)
s17.draw.phasing(R4.cds.reads.for.ploting,"GSM5282047")
s17.draw.periodicity(R4.reads,"GSM5282047")

# ---- R5 : GSM1850252 ----
R5.cds.reads = CDS.ribo[,c("covframe0_R5","covframe1_R5","covframe2_R5",'ID')]
R5.cds.reads.for.ploting = s17.frame.fractions(R5.cds.reads)
s17.draw.phasing(R5.cds.reads.for.ploting,"GSM1850252")
s17.draw.periodicity(R5.reads,"GSM1850252")

#mtext(text = "Supplemental figure 17",side = 1,line = 3.5,cex = 0.6,adj = 1)
dev.off()



# ====================== #
#   END of Figure S10    #
# ====================== #


# ================ #
#     Table S5     #
# ================ #
#   Strong hydrophobic residues (V,I,L,F,M,Y,W) frequency per category of sequences for the three HCA score categories.
# ----------------------- #
#   Table S5 - helpers    #
# ----------------------- #
# Residue-type frequencies of a set of protein sequences. Every "*" character is
# removed from the sequences before they are passed to AA_frequencies_types().
tableS5.residue.types = function(sequences){
  AA_frequencies_types(aa_tab = gsub("\\*", "", sequences))
}
# Sequences looked up by identifier in igorf.fasta / cds.fasta (first match of
# each identifier; identifiers without a match yield NA).
tableS5.igorf.seqs = function(ids){
  igorf.fasta$Sequence[match(ids,igorf.fasta$ID)]
}
tableS5.cds.seqs = function(ids){
  cds.fasta$Sequence[match(ids,cds.fasta$ID)]
}

# ----------- #
#   IGORFs    #
# ----------- #
# Categories with an HCA_bin column are split on its labels ("low",
# "intermediate", "high").
igorf.residues.types        = tableS5.residue.types(igorf.fasta$Sequence)
igorf.residues.types.low    = tableS5.residue.types(tableS5.igorf.seqs(info.igorf$Seq_ID[which(info.igorf$HCA_bin == "low")]))
igorf.residues.types.int    = tableS5.residue.types(tableS5.igorf.seqs(info.igorf$Seq_ID[which(info.igorf$HCA_bin == "intermediate")]))
igorf.residues.types.high   = tableS5.residue.types(tableS5.igorf.seqs(info.igorf$Seq_ID[which(info.igorf$HCA_bin == "high")]))
# ---------------- #
#   occasionally   #
# ---------------- #
# The remaining IGORF-derived categories are split on the HCA score itself:
# low = score < lower.limit, high = score > higher.limit, intermediate = the
# closed interval in between.
occasionally.residues.types      = tableS5.residue.types(tableS5.igorf.seqs(Occasionally.translated$Seq_ID))
occasionally.residues.types.low  = tableS5.residue.types(tableS5.igorf.seqs(Occasionally.translated$Seq_ID[which(Occasionally.translated$HCA_score < lower.limit)]))
occasionally.residues.types.high = tableS5.residue.types(tableS5.igorf.seqs(Occasionally.translated$Seq_ID[which(Occasionally.translated$HCA_score > higher.limit)]))
occasionally.residues.types.int  = tableS5.residue.types(tableS5.igorf.seqs(Occasionally.translated$Seq_ID[which(Occasionally.translated$HCA_score >= lower.limit & Occasionally.translated$HCA_score <= higher.limit)]))
# ---------------- #
#   selectively    #
# ---------------- #
selectively.residues.types      = tableS5.residue.types(tableS5.igorf.seqs(Selectively.translated$Seq_ID))
selectively.residues.types.low  = tableS5.residue.types(tableS5.igorf.seqs(Selectively.translated$Seq_ID[which(Selectively.translated$HCA_score < lower.limit)]))
selectively.residues.types.high = tableS5.residue.types(tableS5.igorf.seqs(Selectively.translated$Seq_ID[which(Selectively.translated$HCA_score > higher.limit)]))
selectively.residues.types.int  = tableS5.residue.types(tableS5.igorf.seqs(Selectively.translated$Seq_ID[which(Selectively.translated$HCA_score >= lower.limit & Selectively.translated$HCA_score <= higher.limit)]))
# -------------- #
#   AncIGORFs    #
# -------------- #
# Ancestral IGORFs carry their own sequence and HCA score columns.
ancestral.residues.types      = tableS5.residue.types(ancestral.igorf$ancIGORF_aa_sequence)
ancestral.residues.types.low  = tableS5.residue.types(ancestral.igorf$ancIGORF_aa_sequence[which(ancestral.igorf$ancIGORF_HCA_score < lower.limit)])
ancestral.residues.types.high = tableS5.residue.types(ancestral.igorf$ancIGORF_aa_sequence[which(ancestral.igorf$ancIGORF_HCA_score > higher.limit)])
ancestral.residues.types.int  = tableS5.residue.types(ancestral.igorf$ancIGORF_aa_sequence[which(ancestral.igorf$ancIGORF_HCA_score >= lower.limit & ancestral.igorf$ancIGORF_HCA_score <= higher.limit)])
# ----------- #
#   de novo   #
# ----------- #
denovo.residues.types       = tableS5.residue.types(tableS5.cds.seqs(de.novo.info$Gene))
denovo.residues.types.low   = tableS5.residue.types(tableS5.cds.seqs(de.novo.info$Gene[which(de.novo.info$HCA_bin == "low")]))
denovo.residues.types.int   = tableS5.residue.types(tableS5.cds.seqs(de.novo.info$Gene[which(de.novo.info$HCA_bin == "intermediate")]))
denovo.residues.types.high  = tableS5.residue.types(tableS5.cds.seqs(de.novo.info$Gene[which(de.novo.info$HCA_bin == "high")]))
# ------- #
#   CDS   #
# ------- #
cds.residues.types          = tableS5.residue.types(cds.fasta$Sequence)
cds.residues.types.low      = tableS5.residue.types(tableS5.cds.seqs(info.cds$Gene[which(info.cds$HCA_bin == "low")]))
cds.residues.types.int      = tableS5.residue.types(tableS5.cds.seqs(info.cds$Gene[which(info.cds$HCA_bin == "intermediate")]))
cds.residues.types.high     = tableS5.residue.types(tableS5.cds.seqs(info.cds$Gene[which(info.cds$HCA_bin == "high")]))

# One row per category, one column per HCA class. Only the first element of each
# AA_frequencies_types() result is exported.
# NOTE(review): presumably the first element is the hydrophobic frequency named
# in the table title - confirm against AA_frequencies_types().
hydrophobicities = 
  rbind(
    c(igorf.residues.types[1],igorf.residues.types.low[1],igorf.residues.types.int[1],igorf.residues.types.high[1]),
    c(occasionally.residues.types[1],occasionally.residues.types.low[1],occasionally.residues.types.int[1],occasionally.residues.types.high[1]),
    c(selectively.residues.types[1],selectively.residues.types.low[1],selectively.residues.types.int[1],selectively.residues.types.high[1]),
    c(ancestral.residues.types[1],ancestral.residues.types.low[1],ancestral.residues.types.int[1],ancestral.residues.types.high[1]),
    c(denovo.residues.types[1],denovo.residues.types.low[1],denovo.residues.types.int[1],denovo.residues.types.high[1]),
    c(cds.residues.types[1],cds.residues.types.low[1],cds.residues.types.int[1],cds.residues.types.high[1])
        )

colnames(hydrophobicities) = c("Total","Low","Intermediate","High")
# Row label spelled "Selectively translated", matching the label used in Table S7.
rownames(hydrophobicities) = c("IGORFs","Occasionally translated","Selectively translated","Ancestral IGORFs","De novo genes","CDS")
write.csv(hydrophobicities,file = paste0(files.path,"Script_and_data_for_figures/outputs/Supplemental_table_S5.csv"),quote = FALSE,row.names = TRUE)
# ===================== #
#   END of Table S5     #
# ===================== #


# ================ #
#     Table S1     #
# ================ #
#   One-sided Mann-Whitney U-test p-values - Protein sequence size
# Pairwise comparison of protein sizes between the six sequence categories.
# Only the upper triangle is filled: cell [i,j] (i < j) holds the value returned
# by multiple.wilcox.test(sample i, sample j, type = "l"); every other cell
# stays NA. Cells are filled row by row, left to right.
# The "Highly" label corresponds to the Selectively.translated set.
tableS1.groups  = c("IGORFs","Occasionally","Highly","AncIGORFs","Denovo","CDS")
# Samples in the same order as the labels above. Ancestral IGORFs have no
# prot_size column here, so their size is the length of the amino-acid sequence.
tableS1.samples = list(
  info.igorf$prot_size,
  Occasionally.translated$prot_size,
  Selectively.translated$prot_size,
  nchar(ancestral.igorf$ancIGORF_aa_sequence),
  de.novo.info$prot_size,
  info.cds$prot_size
)
tableS1 = matrix(nrow = 6,ncol = 6)
colnames(tableS1) = tableS1.groups
rownames(tableS1) = tableS1.groups
for (tableS1.row in 1:5){
  for (tableS1.col in (tableS1.row + 1):6){
    tableS1[tableS1.row,tableS1.col] = multiple.wilcox.test(tableS1.samples[[tableS1.row]],tableS1.samples[[tableS1.col]],type = "l")
  }
}
# write.csv() ignores 'col.names' (with a warning), so it is not passed; with
# row.names = TRUE the header row is still written.
write.csv(tableS1,file = paste0(files.path,"Script_and_data_for_figures/outputs/Supplemental_table_S1.csv"),quote = FALSE,row.names = TRUE)
# ===================== #
#   END of Table S1     #
# ===================== #

# ================ #
#     Table S2     #
# ================ #
#   One-sided Mann-Whitney U-test - Number of clusters
# Pairwise comparison of the number of clusters per sequence between the six
# sequence categories. Only the upper triangle is filled: cell [i,j] (i < j)
# holds the value returned by multiple.wilcox.test(sample i, sample j,
# type = "l"); every other cell stays NA. Cells are filled row by row, left to
# right.
# The "Highly" label corresponds to the Selectively.translated set.
tableS2.groups  = c("IGORFs","Occasionally","Highly","AncIGORFs","Denovo","CDS")
# Samples in the same order as the labels above: lengths() gives one count per
# element of each cluster-size list.
tableS2.samples = list(
  lengths(igorf.cluster.size),
  lengths(Occasionally.translated.clusters),
  lengths(Selectively.translated.clusters),
  lengths(ancestral.igorf.cluster),
  lengths(de.novo.cluster),
  lengths(cds.cluster.size)
)
tableS2 = matrix(nrow = 6,ncol = 6)
colnames(tableS2) = tableS2.groups
rownames(tableS2) = tableS2.groups
for (tableS2.row in 1:5){
  for (tableS2.col in (tableS2.row + 1):6){
    tableS2[tableS2.row,tableS2.col] = multiple.wilcox.test(tableS2.samples[[tableS2.row]],tableS2.samples[[tableS2.col]],type = "l")
  }
}
# write.csv() ignores 'col.names' (with a warning), so it is not passed; with
# row.names = TRUE the header row is still written.
write.csv(tableS2,file = paste0(files.path,"Script_and_data_for_figures/outputs/Supplemental_table_S2.csv"),quote = FALSE,row.names = TRUE)
# ===================== #
#   END of Table S2     #
# ===================== #


# ================ #
#     Table S3     #
# ================ #
#   Two-sided Mann-Whitney U-test - Cluster size
# Pairwise comparison of cluster sizes between the six sequence categories.
# Only the upper triangle is filled: cell [i,j] (i < j) holds the value returned
# by multiple.wilcox.test(sample i, sample j, type = "two.sided"); every other
# cell stays NA. Cells are filled row by row, left to right.
# The "Highly" label corresponds to the Selectively.translated set.
tableS3.groups  = c("IGORFs","Occasionally","Highly","AncIGORFs","Denovo","CDS")
# Samples in the same order as the labels above: unlist() pools the cluster
# sizes of all sequences of a category into one vector.
tableS3.samples = list(
  unlist(igorf.cluster.size),
  unlist(Occasionally.translated.clusters),
  unlist(Selectively.translated.clusters),
  unlist(ancestral.igorf.cluster),
  unlist(de.novo.cluster),
  unlist(cds.cluster.size)
)
tableS3 = matrix(nrow = 6,ncol = 6)
colnames(tableS3) = tableS3.groups
rownames(tableS3) = tableS3.groups
for (tableS3.row in 1:5){
  for (tableS3.col in (tableS3.row + 1):6){
    tableS3[tableS3.row,tableS3.col] = multiple.wilcox.test(tableS3.samples[[tableS3.row]],tableS3.samples[[tableS3.col]],type = "two.sided")
  }
}
# write.csv() ignores 'col.names' (with a warning), so it is not passed; with
# row.names = TRUE the header row is still written.
write.csv(tableS3,file = paste0(files.path,"Script_and_data_for_figures/outputs/Supplemental_table_S3.csv"),quote = FALSE,row.names = TRUE)
# ===================== #
#   END of Table S3     #
# ===================== #

# ================ #
#     Table S4     #
# ================ #
#   One-sided Mann-Whitney U-test - Linker Size
# Pairwise comparison of linker sizes between the six sequence categories.
# Only the upper triangle is filled: cell [i,j] (i < j) holds the value returned
# by multiple.wilcox.test(sample i, sample j, type = "l"); every other cell
# stays NA. Cells are filled row by row, left to right.
# The "Highly" label corresponds to the Selectively.translated set.
tableS4.groups  = c("IGORFs","Occasionally","Highly","AncIGORFs","Denovo","CDS")
# Samples in the same order as the labels above: for every category the pooled
# linker sizes are concatenated with the pooled extremity sizes.
tableS4.samples = list(
  c(unlist(igorf.linker.size),unlist(igorf.extremities.size)),
  c(unlist(Occasionally.translated.linkers),unlist(Occasionally.translated.extremities)),
  c(unlist(Selectively.translated.linkers),unlist(Selectively.translated.extremities)),
  c(unlist(ancestral.igorf.linker),unlist(ancestral.igorf.extremities)),
  c(unlist(de.novo.linker),unlist(de.novo.extremities)),
  c(unlist(cds.linker.size),unlist(cds.extremities.size))
)
tableS4 = matrix(nrow = 6,ncol = 6)
colnames(tableS4) = tableS4.groups
rownames(tableS4) = tableS4.groups
for (tableS4.row in 1:5){
  for (tableS4.col in (tableS4.row + 1):6){
    tableS4[tableS4.row,tableS4.col] = multiple.wilcox.test(tableS4.samples[[tableS4.row]],tableS4.samples[[tableS4.col]],type = "l")
  }
}
# write.csv() ignores 'col.names' (with a warning), so it is not passed; with
# row.names = TRUE the header row is still written.
write.csv(tableS4,file = paste0(files.path,"Script_and_data_for_figures/outputs/Supplemental_table_S4.csv"),quote = FALSE,row.names = TRUE)
# ===================== #
#   END of Table S4     #
# ===================== #


# ====================================== #
#  De novo genes information - Table S6
# ====================================== #
# Residue-type frequencies of every de novo gene, computed once and reused
# below. AA_frequencies_types() is applied to each sequence (column 2 of
# cds.fasta, rows matched on the gene name); the result is indexed as a matrix
# with one column per gene.
denovo.aa.types.per.gene = apply(as.data.frame(cds.fasta[match(de.novo.info$Gene,cds.fasta$ID),2]),1,AA_frequencies_types)
# Rows 1 and 2 are echoed to the console when the script is run interactively
# or through Rscript; they are the two rows exported below.
denovo.aa.types.per.gene[1,]
denovo.aa.types.per.gene[2,]

# Table S6 is assembled column by column:
#   - de.novo.info without its columns 1, 8 and 9
#   - the ancestral gene type, matched on the gene name
#   - the number of clusters of the gene (element count of its entry in cds.cluster.size)
#   - rows 1 and 2 of the residue-type frequencies computed above
DENOVO = cbind(de.novo.info[,-c(1,8,9)])
DENOVO = cbind(DENOVO,ancestral.igorf$gene_type[match(DENOVO$Gene,ancestral.igorf$gene_name)])
DENOVO = cbind(DENOVO,lengths(cds.cluster.size[match(DENOVO$Gene,info.cds$Gene)]))
DENOVO = cbind(DENOVO,denovo.aa.types.per.gene[1,])
DENOVO = cbind(DENOVO,denovo.aa.types.per.gene[2,])

# The automatic column names produced by cbind() are replaced by the final headers.
colnames(DENOVO) = c("Gene_name","Protein_size","HCA_score","HCA_bin",
                     "disorder_propensity","aggregation_propensity",
                     "Ancestral_type","Clusters_count","Hydrophobic_percentage",
                     "Hydrophilic_percentage")

write.csv(DENOVO,file = paste0(files.path,"Script_and_data_for_figures/outputs/Supplemental_table_S6.csv"),quote = FALSE,row.names = FALSE)
# ===================== #
#   END of Table S6     #
# ===================== #


# ================ #
#     Table S7     #
# ================ #
# Stop codons frequencies 
# The three stop codons whose frequencies are tabulated.
stop.codon.triplets = c("TAA","TAG","TGA")
# Positions in igorf.nt.fasta of the occasionally / selectively translated IGORFs.
occasionally.nt.rows = match(Occasionally.translated$Seq_ID,igorf.nt.fasta$ID)
selectively.nt.rows  = match(Selectively.translated$Seq_ID,igorf.nt.fasta$ID)

# Stop codon frequencies of each set of nucleotide sequences.
cds.stop.codons          = codons.frequencies(cds.nt.fasta$Sequence,
                                              codons.list = stop.codon.triplets)
igorf.stop.codons        = codons.frequencies(igorf.nt.fasta$Sequence,
                                              codons.list = stop.codon.triplets)
occasionally.stop.codons = codons.frequencies(igorf.nt.fasta$Sequence[occasionally.nt.rows],
                                              codons.list = stop.codon.triplets)
selectively.stop.codons  = codons.frequencies(igorf.nt.fasta$Sequence[selectively.nt.rows],
                                              codons.list = stop.codon.triplets)

# One row per category; the columns are labelled with the RNA spelling of the codons.
STOP.freqs = rbind(igorf.stop.codons,occasionally.stop.codons,selectively.stop.codons,cds.stop.codons)
dimnames(STOP.freqs) = list(c("IGORFs","Occasionally translated","Selectively translated","CDS"),
                            c("UAA","UAG","UGA"))
write.csv(STOP.freqs,
          file = paste0(files.path,"Script_and_data_for_figures/outputs/Supplemental_table_S7.csv"),
          quote = FALSE,
          row.names = TRUE)
# ===================== #
#   END of Table S7     #
# ===================== #
