# Command-line interface: the first trailing argument is a directory that
# becomes the working directory. Every relative path used later in this
# script (the input tables and the TO_KEEP.tab output) is resolved against it.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1 || !dir.exists(args[1])) {
  # Fail with an explicit usage message rather than the opaque setwd() error.
  stop("Usage: Rscript <script> <working_directory> (an existing directory is required)",
       call. = FALSE)
}
setwd(args[1])


# Relabel weak hits as 'NaN'.
#
# Args:
#   tab:       data frame with a Seq_type column (coerced to character) and a
#              numeric Seq_MEGA_eval column.
#   threshold: e-value cutoff; rows whose Seq_MEGA_eval is strictly greater
#              than this value get Seq_type set to the string 'NaN'.
#
# Returns:
#   `tab` with the updated Seq_type column. Rows whose e-value is NA are left
#   unchanged.
assign_with_evalue_threshold <- function(tab, threshold) {
  tab$Seq_type <- as.character(tab$Seq_type)
  exceeds_cutoff <- tab$Seq_MEGA_eval > threshold
  # NA comparisons must not select a row, exactly as which() would skip them.
  tab$Seq_type[!is.na(exceeds_cutoff) & exceeds_cutoff] <- "NaN"
  #tab[which(tab$Seq_type == 'CDS' & tab$Seq_MEGA_cov <= 0.70),c('Seq_type')] <- 'CDS?'
  tab
}

# Re-examine the rows still annotated 'To_See' and promote some to 'KEEP'.
#
# Columns 3 to 7 of `table` are read as the five per-species types. A row is
# promoted when both conditions hold:
#   * its 'CDS' entries occupy exactly the leading positions of that range
#     (position 1, or 1-2, or 1-2-3, ...) with no gap, and
#   * at least one of the five entries is 'IGORF' or 'IGR'.
# Rows without any 'CDS' entry are left untouched.
#
# Args:
#   table: data frame with an Annotation column and the types in columns 3:7.
#
# Returns:
#   `table` with the Annotation column updated.
correct_to_see <- function(table) {
  pending_rows <- which(table$Annotation == "To_See")
  for (row_idx in pending_rows) {
    species_types <- table[row_idx, 3:7]
    cds_positions <- which(species_types == "CDS")
    n_cds <- length(cds_positions)

    # The CDS entries must form the gap-free prefix 1..n_cds.
    if (n_cds == 0 || !all(cds_positions == seq_len(n_cds))) {
      next
    }

    if (any(c("IGORF", "IGR") %in% unlist(c(species_types)))) {
      table$Annotation[row_idx] <- "KEEP"
    }
  }
  table
}

# ---- Load the three families of search tables ------------------------------
# Each table is read with its header line; hits whose Seq_MEGA_eval exceeds
# 0.01 are then relabelled 'NaN' by assign_with_evalue_threshold().
load_thresholded_tab <- function(path) {
  assign_with_evalue_threshold(tab = read.table(path, header = TRUE), threshold = 0.01)
}

# Tables from the denovo_vs_CDS_tab directory.
Scer.vs.SparCDS <- load_thresholded_tab('denovo_vs_CDS_tab/Scer-vs-SparCDS.tab')
Scer.vs.SarbCDS <- load_thresholded_tab('denovo_vs_CDS_tab/Scer-vs-SarbCDS.tab')
Scer.vs.SmikCDS <- load_thresholded_tab('denovo_vs_CDS_tab/Scer-vs-SmikCDS.tab')
Scer.vs.SkudCDS <- load_thresholded_tab('denovo_vs_CDS_tab/Scer-vs-SkudCDS.tab')
Scer.vs.SbayCDS <- load_thresholded_tab('denovo_vs_CDS_tab/Scer-vs-SbayCDS.tab')

# Tables from the denovo_vs_IGR_tab directory.
Scer.vs.SparIGR <- load_thresholded_tab('denovo_vs_IGR_tab/Scer-vs-SparIGR.tab')
Scer.vs.SarbIGR <- load_thresholded_tab('denovo_vs_IGR_tab/Scer-vs-SarbIGR.tab')
Scer.vs.SmikIGR <- load_thresholded_tab('denovo_vs_IGR_tab/Scer-vs-SmikIGR.tab')
Scer.vs.SkudIGR <- load_thresholded_tab('denovo_vs_IGR_tab/Scer-vs-SkudIGR.tab')
Scer.vs.SbayIGR <- load_thresholded_tab('denovo_vs_IGR_tab/Scer-vs-SbayIGR.tab')

# Tables from the denovo_vs_all_ORFs_more_than_12nt_tab directory.
Scer.vs.Spar <- load_thresholded_tab('denovo_vs_all_ORFs_more_than_12nt_tab/Scer-vs-Spar.tab')
Scer.vs.Sarb <- load_thresholded_tab('denovo_vs_all_ORFs_more_than_12nt_tab/Scer-vs-Sarb.tab')
Scer.vs.Smik <- load_thresholded_tab('denovo_vs_all_ORFs_more_than_12nt_tab/Scer-vs-Smik.tab')
Scer.vs.Skud <- load_thresholded_tab('denovo_vs_all_ORFs_more_than_12nt_tab/Scer-vs-Skud.tab')
Scer.vs.Sbay <- load_thresholded_tab('denovo_vs_all_ORFs_more_than_12nt_tab/Scer-vs-Sbay.tab')

# In the all-ORF tables, a hit typed 'CDS' whose Seq_id ends with
# 'nc_ovp_opp-CDS' or 'nc_ovp_same-CDS' is retyped 'IGORF'.
relabel_cds_overlapping_orfs <- function(tab) {
  for (id_suffix in c("nc_ovp_opp-CDS", "nc_ovp_same-CDS")) {
    hits <- which(endsWith(tab$Seq_id, suffix = id_suffix) & tab$Seq_type == "CDS")
    tab$Seq_type[hits] <- "IGORF"
  }
  tab
}

Scer.vs.Spar <- relabel_cds_overlapping_orfs(Scer.vs.Spar)
Scer.vs.Smik <- relabel_cds_overlapping_orfs(Scer.vs.Smik)
Scer.vs.Skud <- relabel_cds_overlapping_orfs(Scer.vs.Skud)
Scer.vs.Sarb <- relabel_cds_overlapping_orfs(Scer.vs.Sarb)
Scer.vs.Sbay <- relabel_cds_overlapping_orfs(Scer.vs.Sbay)



# Choose, row by row, which of three parallel tables supplies the retained hit.
#
# The three tables are walked in lockstep by row index, so they must describe
# the same queries in the same order. For row i, the first matching rule wins:
#   1. table3 is 'CDS'                          -> row of table3
#   2. table1 'IGORF' and table2 'IGR' or 'NaN' -> row of table1
#   3. table1 'CDS'   and table2 'NaN'          -> row of table1
#   4. table1 'NaN'   and table2 'NaN'          -> row of table1
#   5. table1 'NaN'   and table2 'IGR'          -> row of table2
#   6. table1 'CDS'   and table2 'IGR'          -> row of table1 when its
#      Seq_MEGA_cov is strictly greater than table2's, otherwise row of
#      table2; the chosen Seq_id is printed.
# A row that matches no rule is left out of the result. Because that shifts
# every later row relative to the inputs, a warning now lists such rows.
#
# Args:
#   table1: data frame with Seq_type, Seq_MEGA_cov and Seq_id columns; its
#           row count drives the iteration.
#   table2: data frame with Seq_type, Seq_MEGA_cov and Seq_id columns.
#   table3: data frame with a Seq_type column.
#
# Returns:
#   A data frame of the retained rows in input order, or NULL when no row is
#   retained (including when table1 has no rows).
decide_line <- function(table1, table2, table3) {
  n_rows <- nrow(table1)
  kept_rows <- vector("list", n_rows)
  unmatched <- integer(0)

  # seq_len() yields an empty loop for an empty table, unlike seq(1, 0).
  for (i in seq_len(n_rows)) {
    type1 <- table1$Seq_type[i]
    type2 <- table2$Seq_type[i]

    if (table3$Seq_type[i] == "CDS") {
      kept_rows[[i]] <- table3[i, ]
    } else if (type1 == "IGORF" && type2 == "IGR") {
      kept_rows[[i]] <- table1[i, ]
    } else if (type1 == "IGORF" && type2 == "NaN") {
      kept_rows[[i]] <- table1[i, ]
    } else if (type1 == "CDS" && type2 == "NaN") {
      kept_rows[[i]] <- table1[i, ]
    } else if (type1 == "NaN" && type2 == "NaN") {
      kept_rows[[i]] <- table1[i, ]
    } else if (type1 == "NaN" && type2 == "IGR") {
      kept_rows[[i]] <- table2[i, ]
    } else if (type1 == "CDS" && type2 == "IGR") {
      # Conflicting calls: keep the hit with the larger coverage.
      if (table1$Seq_MEGA_cov[i] > table2$Seq_MEGA_cov[i]) {
        kept_rows[[i]] <- table1[i, ]
        print(paste("Descision:", table1$Seq_id[i]))
      } else {
        kept_rows[[i]] <- table2[i, ]
        print(paste("Descision:", table2$Seq_id[i]))
      }
    } else {
      unmatched <- c(unmatched, i)
    }
  }

  if (length(unmatched) > 0) {
    warning(
      length(unmatched),
      " row(s) matched no decision rule and were dropped (row index: ",
      paste(unmatched, collapse = ", "), ")",
      call. = FALSE
    )
  }

  # Bind once at the end instead of growing a data frame inside the loop.
  kept_rows <- kept_rows[!vapply(kept_rows, is.null, logical(1))]
  if (length(kept_rows) == 0) {
    return(NULL)
  }
  do.call(rbind, kept_rows)
}

# One final table per species: decide_line() picks, row by row, between the
# all-ORF table (table1), the IGR table (table2) and the CDS table (table3).
Scer.vs.Spar.final <- decide_line(Scer.vs.Spar, Scer.vs.SparIGR, Scer.vs.SparCDS)
Scer.vs.Smik.final <- decide_line(Scer.vs.Smik, Scer.vs.SmikIGR, Scer.vs.SmikCDS)
Scer.vs.Skud.final <- decide_line(Scer.vs.Skud, Scer.vs.SkudIGR, Scer.vs.SkudCDS)
Scer.vs.Sarb.final <- decide_line(Scer.vs.Sarb, Scer.vs.SarbIGR, Scer.vs.SarbCDS)
Scer.vs.Sbay.final <- decide_line(Scer.vs.Sbay, Scer.vs.SbayIGR, Scer.vs.SbayCDS)

# Row positions shared by all five final tables for a given set of types.
final_tabs <- list(Scer.vs.Sbay.final, Scer.vs.Sarb.final, Scer.vs.Skud.final,
                   Scer.vs.Spar.final, Scer.vs.Smik.final)
rows_with_type_everywhere <- function(types) {
  Reduce(intersect,
         lapply(final_tabs, function(tab) which(tab$Seq_type %in% types)))
}

# Genes typed 'CDS' in all the species: either wrongly thought to be de novo,
# or ancient de novo genes within the Saccharomycetaceae family.
all.CDS <- rows_with_type_everywhere('CDS')

# Genes strict to Scer, with NO detectable sequence in any other species
# --> fast divergent.
all.NAN <- rows_with_type_everywhere('NaN')

# Genes to keep: every species is typed 'IGORF', 'NaN' or 'IGR'. These are the
# easy cases for reconstructing the ancestral sequence.
interesting.genes <- rows_with_type_everywhere(c('IGORF', 'NaN', 'IGR'))

# Per-gene summary table: gene identifiers taken from the Spar final table,
# the retained Seq_type of each of the five species, and an Annotation column
# that starts as 'To_See'.
my.table <- data.frame(
  Gene_code = as.character(Scer.vs.Spar.final$Scer_gene),
  Gene_name = as.character(Scer.vs.Spar.final$Scer_gene_name),
  Spar = as.character(Scer.vs.Spar.final$Seq_type),
  Smik = as.character(Scer.vs.Smik.final$Seq_type),
  Skud = as.character(Scer.vs.Skud.final$Seq_type),
  Sarb = as.character(Scer.vs.Sarb.final$Seq_type),
  Sbay = as.character(Scer.vs.Sbay.final$Seq_type),
  Annotation = rep('To_See', length(Scer.vs.Spar.final$Scer_gene)),
  stringsAsFactors = FALSE
)
#as.character(info.cds$AA_sequence[match(as.character(Scer.vs.Spar$Scer_gene) , info.cds$ID)])))

# Later assignments override earlier ones, so the order below matters.
my.table$Annotation[all.CDS] <- 'Ancient_De_novo'
my.table$Annotation[interesting.genes] <- 'KEEP'
my.table$Annotation[all.NAN] <- 'Fast_divergent'

# 'Not_enough': exactly four of the five species have no hit. The count is
# restricted to the species columns, so an NA (or a literal 'NaN') in the gene
# identifier columns can no longer distort or disable the rule.
species_cols <- c('Spar', 'Smik', 'Skud', 'Sarb', 'Sbay')
nan_per_gene <- rowSums(my.table[, species_cols] == 'NaN', na.rm = TRUE)
my.table$Annotation[which(nan_per_gene == 4)] <- 'Not_enough'

my.table <- correct_to_see(table = my.table)



#rec = gene2code$V2[match(my.table$Gene_code[which(my.table$Annotation == "KEEP")],gene2code$V1)]
#ancestral.igorf[match(my.table$Gene_code[which(my.table$Annotation == "KEEP")],ancestral.igorf$gene_name),]
#anc.new = NULL
#for (i in my.table$Gene_code[which(my.table$Annotation == "KEEP")]){
#  anc.new = rbind(anc.new,ancestral.igorf[which(ancestral.igorf$gene_name == i),])
#}

# Keep only the genes annotated 'KEEP' (all eight summary columns).
to_keep <- my.table[which(my.table$Annotation == 'KEEP'), 1:8]

# For each kept gene, `level` is the position (1 to 5, within columns 3 to 7)
# of the first species typed 'IGORF' or 'IGR', or NA when there is none.
# seq_len() makes this an empty vector when no gene is kept; iterating over
# seq(1, 0) instead produced two spurious NA entries that cbind() could not
# attach to a zero-row table.
level <- vapply(
  seq_len(nrow(to_keep)),
  function(i) {
    species_types <- to_keep[i, 3:7]
    which(species_types == "IGORF" | species_types == "IGR")[1]
  },
  integer(1)
)

to_keep <- cbind(to_keep, level)

#write.table(my.table[which(my.table$Annotation == 'KEEP'),c(1:8)],file=paste(args[1],'/TO_KEEP.tab',sep=''),quote = F,row.names = F)
# Written relative to the working directory set at the top of the script.
write.table(to_keep, file = 'TO_KEEP.tab', quote = FALSE, row.names = FALSE)

