

source("../headers.R")
source("../functions.R")



# Output locations, relative to the current working directory.
img_dir <- "images/"
base_result_dir <- "results/"

# Create both output directories (including missing parents). Warnings are
# switched off, so an already existing directory is silently accepted.
# TRUE/FALSE are spelled out: the T/F shorthands are ordinary variables that
# can be reassigned.
dir.create(img_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(base_result_dir, showWarnings = FALSE, recursive = TRUE)



# Tab-separated table with a header line; the ID and GROUP_NAME columns are the
# ones used further down in this script.
groups <- read.table(
    "../IES_Excision_Score/IES_Groups.tsv",
    header = TRUE,
    sep = "\t"
)
# Group names and the colours used when plotting them.
group_names <- c("Very early", "Early", "Intermediate", "Late", "None")
gcolors <- c("#E41A1C", "#FF7F00", "#4DAF4A", "dodgerblue", "grey")



# Reference genome FASTA file.
genome <- "../data/ptetraurelia_mac_51_with_ies.fa"
# File name without directory and without its FASTA extension. The dot is
# escaped and the pattern anchored at the end of the name, so only a real
# ".fa" / ".fasta" suffix is removed (an unescaped ".fa" would also match any
# character followed by "fa" elsewhere in the name).
basegenome <- sub("\\.(fa|fasta)$", "", basename(genome))
# Reference prefix used in the names of the mapping log files.
ref_pref <- "pt_51_with_ies"

# `colors`, `prefixes` and `labels` are not defined in this file; presumably
# they come from the sourced headers.R (TODO confirm).
label_colors <- colors
# Bare top-level expression: displays the prefix/label pairing when the script
# is run with auto-printing.
data.frame(
    PREFIX = prefixes,
    LABEL = labels
)


# Table of internal IESs (tab-separated, with header). Argument names are
# written in full instead of relying on partial matching (h=, stringsAsFactor=).
internal_ies_basename <- "internal_IES.pt_51_with_ies"
internal_IES <- read.table(
    paste0("../IES_MIC-limited/internal_IES/", internal_ies_basename, ".tsv"),
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE
)

# Distinct values of the IES_ID column; they are removed from the "All" IES
# group built later in the script.
encompassing_ies_ids <- unique(internal_IES$IES_ID)

# Reverse-complement helper.
#
# x: a vector whose SECOND element is the sequence string (the first element
#    is ignored), e.g. one row passed by apply(..., 1, revCompSeq).
# Returns a single string: the sequence split into characters with s2c(),
# passed through comp(), reversed, and glued back together.
# s2c() and comp() are resolved when the function is called; presumably they
# are the seqinr functions loaded further down in this script.
revCompSeq <- function(x) {
    bases <- s2c(x[2])
    complemented <- comp(bases)
    paste(rev(complemented), collapse = "")
}

# IES feature table (tab-separated, with header), with rows named by the ID
# column so that rows can be looked up by identifier. Argument names are
# written in full instead of relying on partial matching.
ies <- read.table(
    "../data/internal_eliminated_sequence_PGM_IES51_features.tab",
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE
)
rownames(ies) <- ies$ID

library(seqinr)
library(stringr)
# One row per IES:
#   MAC_FLANK_SEQ  first match of 5 word characters + "TA" + 5 word characters
#   LEFT           leading characters of IES_SEQ, up to position 13
#   RIGHT          value returned by get_right_boundary_seq() (defined outside
#                  this file) for each (ID, IES_SEQ) row
boundaries <- data.frame(
    ID = ies$ID,
    LENGTH = ies$SIZE,
    MAC_FLANK_SEQ = str_extract(ies$MAC_FLANK_SEQ, c("\\w{5}TA\\w{5}")),
    LEFT = substr(ies$IES_SEQ, 0, 13),
    RIGHT = apply(
        ies[, c("ID", "IES_SEQ")],
        1,
        get_right_boundary_seq,
        add_TA = FALSE,
        boundary_length = 13
    )
)

# For each 5-nt motif, flag the IESs whose LEFT / RIGHT boundary starts with
# it, plus a combined flag (named after the motif) that requires both ends.
for (motif in c("TATAG", "TACAG")) {
    left_has_motif <- substr(boundaries$LEFT, 1, 5) == motif
    right_has_motif <- substr(boundaries$RIGHT, 1, 5) == motif
    boundaries[[paste0("LEFT_", motif)]] <- left_has_motif
    boundaries[[paste0("RIGHT_", motif)]] <- right_has_motif
    boundaries[[motif]] <- left_has_motif & right_has_motif
}

rownames(boundaries) <- boundaries$ID


# number of mapped reads
# For every sample prefix, locate its log file below mapping_base_dir and read
# the NB_READS_RMDUP value from the first lines of that log.
mapping_base_dir <- paste0("../data/mapping/", basegenome, "/DNAseq/")

# Preallocated so that nb_mapped_reads[i] always belongs to prefixes[i]: a
# sample whose count cannot be read stays NA instead of silently shifting the
# values of the following samples.
nb_mapped_reads <- rep(NA_real_, length(prefixes))
for (p in seq_along(prefixes)) {
    prefix <- prefixes[p]

    # Arguments are shell-quoted so that unusual characters in a path or a
    # prefix cannot be interpreted by the shell.
    log_name <- paste0(prefix, ".rmdup.BOWTIE.", ref_pref, ".log")
    find_cmd <- paste0("find ", shQuote(mapping_base_dir), " -name ", shQuote(log_name))
    log_file <- tryCatch(
        system(find_cmd, intern = TRUE),
        error = function(e) character(0)
    )
    print(paste(prefix, log_file))

    if (length(log_file) != 1) {
        warning("Expected exactly one log file for ", prefix, ", found ",
                length(log_file), call. = FALSE)
        next
    }

    # Only the first 10 lines are inspected, like the default of `head`.
    log_head <- readLines(log_file, n = 10, warn = FALSE)
    count_line <- grep("NB_READS_RMDUP=", log_head, fixed = TRUE, value = TRUE)
    if (length(count_line) != 1) {
        warning("Expected exactly one NB_READS_RMDUP line in ", log_file,
                ", found ", length(count_line), call. = FALSE)
        next
    }
    nb_mapped_reads[p] <- as.numeric(sub("NB_READS_RMDUP=", "", count_line, fixed = TRUE))
}
nb_mapped_reads





# Load, for every sample prefix, the ParTIES Compare table and keep the rows
# that pass the three filters below. Samples without a table are reported and
# get no entry in `data`.
data <- list()
for (prefix in prefixes) {
    # First directory named "<prefix>.rmdup" found below ../data/ParTIES/
    find_cmd <- paste0("find ../data/ParTIES/ -name ", prefix, ".rmdup | head -1")
    sample_dir <- try(system(find_cmd, intern = TRUE))
    compare_file <- paste0(sample_dir, "/ParTIES/Compare/Compare.current.tab")
    print(compare_file)

    if (file.exists(compare_file)) {
        tab <- read.delim(compare_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

        # 1. neither flagged IDENTICAL nor with an 'identical' boundary relation
        not_identical <- !tab$IDENTICAL & tab$BOUNDARY_RELATION != 'identical'
        tab <- tab[not_identical, ]
        # 2. boundary relation must be known
        has_relation <- !is.na(tab$BOUNDARY_RELATION)
        tab <- tab[has_relation, ]
        # 3. not flagged IS_OVERLAPPING
        not_overlapping <- !tab$IS_OVERLAPPING
        tab <- tab[not_overlapping, ]

        data[[prefix]] <- tab
    } else {
        print(paste("No file", compare_file))
    }
}


# Number of errors normalized
# One value per sample: number of rows kept in data[[prefix]] per million
# mapped reads. The vector is preallocated so that it always has one entry per
# prefix; a sample without a loaded table stays NA (nrow(NULL) would otherwise
# drop out of the vector and make names(results) <- labels fail).
results <- rep(NA_real_, length(prefixes))
for (f in seq_along(prefixes)) {
    prefix <- prefixes[f]
    tab <- data[[prefix]]
    if (is.null(tab)) {
        warning("No Compare table loaded for ", prefix, "; value set to NA", call. = FALSE)
    } else {
        results[f] <- nrow(tab) / (nb_mapped_reads[f] / 1e6)
    }
    #results=c(results,c(sum(tab$SUPPORT_VARIANT) / (nb_mapped_reads[f] /1e6)))

    print(paste(sep = " ", prefix, labels[f], nb_mapped_reads[f], round(nb_mapped_reads[f] / 1e6, 0)))
}
names(results) <- labels


pdf(paste0(img_dir, "nb_errors_normalized.pdf"), width = 10)
# Large bottom margin for the vertical sample labels (las = 2).
par(mar = c(12.1, 4.1, 4.1, 4.1), xpd = TRUE)
barplot(results, names.arg = labels, col = "grey", las = 2, ylab = "Number of non-redundant errors (per M)")
dev.off()
# NOTE(review): the table is written to the current working directory although
# base_result_dir is created at the top of the script and never used. The path
# is left unchanged here; confirm which location is intended.
write.table(results, "nb_errors_normalized.tab", sep = "\t", quote = FALSE, col.names = FALSE)




#######################################
# excision error during autogamy time course
#######################################
# For every sample keep the "excision error" events longer than 25 whose
# boundary relation is not "no_overlap", and add an ERROR_CATEGORY column.

# BOUNDARY_RELATION -> ERROR_CATEGORY. Relations that are not listed keep
# their own name as category.
excision_error_categories <- c(
    left_internal  = "PARTIAL_INTERNAL",
    right_internal = "PARTIAL_INTERNAL",
    left_external  = "PARTIAL_EXTERNAL",
    right_external = "PARTIAL_EXTERNAL",
    only_overlap   = "OVERLAP",
    #no_overlap    = "CRYPTIC",
    internal       = "INTERNAL",
    external       = "EXTERNAL"
)

excision_errors <- list()
for (f in seq_along(prefixes)) {
    prefix <- prefixes[f]
    tab <- data[[prefix]]
    if (is.null(tab)) {
        warning("No Compare table loaded for ", prefix, "; sample skipped", call. = FALSE)
        next
    }

    # which() drops NA comparisons; a plain logical index would turn each NA
    # into a phantom all-NA row.
    tab <- tab[which(tab$ERROR_TYPE == "excision error"), ]
    long_events <- which(tab$SIZE > 25)
    # total excision errors, those with SIZE > 25, and the percentage kept
    print(paste(nrow(tab), length(long_events), round(length(long_events) / nrow(tab) * 100, 1)))
    tab <- tab[long_events, ]
    tab <- tab[which(tab$BOUNDARY_RELATION != "no_overlap"), ]

    # Element-wise assignment on the column: unlike
    # tab[rows, ]$ERROR_CATEGORY <- "X", it does not fail when no row matches.
    tab$ERROR_CATEGORY <- tab$BOUNDARY_RELATION
    relation <- as.character(tab$BOUNDARY_RELATION)
    known <- which(is.element(relation, names(excision_error_categories)))
    tab$ERROR_CATEGORY[known] <- unname(excision_error_categories[relation[known]])

    excision_errors[[prefix]] <- tab
}




# IES excision error categories
# Build a category x sample table of excision errors per million mapped reads.
# `results` starts as NULL: testing for zero rows instead would discard the
# column of a first sample that has no excision error at all.
results <- NULL
nb_events <- c()
for (f in seq_along(prefixes)) {
    prefix <- prefixes[f]
    tab <- excision_errors[[prefix]]

    nb_events <- c(nb_events, nrow(tab))
    res <- as.data.frame(table(tab$ERROR_CATEGORY) / (nb_mapped_reads[f] / 1e6))
    colnames(res) <- c("ERROR_CATEGORY", prefix)

    if (is.null(results)) {
        results <- res
    } else {
        results <- merge(results, res, by = "ERROR_CATEGORY", all = TRUE)
    }
}
rownames(results) <- results$ERROR_CATEGORY
# drop = FALSE keeps a data.frame even when there is a single sample column.
results <- results[, -1, drop = FALSE]

# A category absent from a sample means zero events.
results[is.na(results)] <- 0

# One colour per known category. The palette size follows the list of names,
# not the number of rows observed in the data, so the names always fit.
error_category_names <- c("INTERNAL", "PARTIAL_INTERNAL", "OVERLAP", "EXTERNAL", "PARTIAL_EXTERNAL")
error_category_colors <- brewer.pal(length(error_category_names), "Dark2")
names(error_category_colors) <- error_category_names


pdf(paste0(img_dir, "excision_error.pdf"), width = 12)
par(mfrow = c(1, 2), mar = c(8.1, 4.1, 4.1, 2.1))
# Left panel: stacked counts per million reads (10% headroom for the legend).
barplot(as.matrix(results), ylim = c(0, max(colSums(results)) * 1.1), names.arg = labels, col = error_category_colors[rownames(results)], las = 2, ylab = "# of IES excision errors (per M)", main = "")
legend("top", legend = tolower(rownames(results)), col = error_category_colors[rownames(results)], pch = 15, ncol = 2, bty = "n")
# Right panel: same data, each sample column rescaled to sum to 1.
barplot(prop.table(as.matrix(results), margin = 2), names.arg = labels, col = error_category_colors[rownames(results)], las = 2, ylab = "Proportion of IES excision error types", main = "")
dev.off()


# Internal IESs
internal_ies_basename <- "internal_IES.pt_51_with_ies"
internal_IES <- read.table(
    paste0("../IES_MIC-limited/internal_IES/", internal_ies_basename, ".tsv"),
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE
)
encompassing_ies_ids <- unique(internal_IES$IES_ID)

# create several groups of IES according to their locations
# "All": IESs whose group is not "None", minus the IDs listed in internal_IES.
ies_groups <- list(
    "All" = setdiff(groups[groups$GROUP_NAME != "None", ]$ID, encompassing_ies_ids)
)
ies_groups[["InCDS"]] <- intersect(ies[ies$IN_CDS, ]$ID, ies_groups[["All"]])
ies_groups[["InIntergenic"]] <- intersect(ies[!ies$IN_GENE, ]$ID, ies_groups[["All"]])

# Size classes are half-open intervals [lower, upper), named "<lower>-<upper-1>_nt".
# The last break is max(SIZE) + 1 so that the largest IES falls into the last
# class (with max(SIZE) itself as break it was excluded from every class).
ies_sizes <- c(25, 34, 42, 141, max(ies$SIZE, na.rm = TRUE) + 1)
for (s in seq_along(ies_sizes)[-1]) {
    lower <- ies_sizes[s - 1]
    upper <- ies_sizes[s]
    in_class <- ies$SIZE >= lower & ies$SIZE < upper
    ies_groups[[paste0(lower, "-", upper - 1, "_nt")]] <- intersect(ies[in_class, ]$ID, ies_groups[["All"]])
}

# Retention tables returned by get_MIRET_score() (defined outside this file)
# are merged by ID into one wide table `retention`; the columns of each
# data set get a data-set specific suffix.
parties_dir <- "../data/ParTIES/"

# Merge the MIRET table read from `miret_path` into `retention` (by "ID") and
# append `suffix` to the names of the last length(cols) columns of the merged
# table, i.e. the columns contributed by the new table. `cols` is the global
# vector of un-suffixed column names set from the first table below.
add_miret_scores <- function(retention, miret_path, suffix) {
    merged <- merge(retention, get_MIRET_score(miret_path, all = TRUE), by = "ID")
    nb_col <- length(colnames(merged))
    colnames(merged)[(nb_col - length(cols) + 1):nb_col] <- paste(cols, suffix, sep = "")
    merged
}

# EZL-dep
retention <- get_MIRET_score(paste0(parties_dir, "Ezl174-1_RNAi_r1_r2//ParTIES/MIRET/MIRET.tab"), all = TRUE)
cols <- colnames(retention)[-1]
nb_col <- length(colnames(retention))
colnames(retention)[(nb_col - length(cols) + 1):nb_col] <- paste(cols, "_EZL1_r1", sep = "")
retention <- add_miret_scores(retention, paste0(parties_dir, "Ezl174-2_RNAi_r1///ParTIES/MIRET/MIRET.tab"), "_EZL1_r2")
# Significant in at least one of the two replicates.
retention$SIGNIFICANT_EZL1 <- retention$SIGNIFICANT_EZL1_r1 | retention$SIGNIFICANT_EZL1_r2

# TFIIS4
retention <- add_miret_scores(retention, paste0(parties_dir, "TFIIS4/ParTIES/MIRET/MIRET.tab"), "_TFIIS4")

# DCL2/3-dep
retention <- add_miret_scores(retention, paste0(parties_dir, "DCL2_3_RNAi_r1_HBJ-1/ParTIES/MIRET/MIRET.tab"), "_DCL23_r1")
retention <- add_miret_scores(retention, paste0(parties_dir, "Dcl2-3_RNAi_r2/ParTIES/MIRET/MIRET.tab"), "_DCL23_r2")

# Significant in at least one replicate; retention score averaged over both.
retention$SIGNIFICANT_DCL23 <- retention$SIGNIFICANT_DCL23_r1 | retention$SIGNIFICANT_DCL23_r2
dcl23_score_columns <- paste0("RETENTION_SCORE_", c("DCL23_r1", "DCL23_r2"))
retention$RETENTION_SCORE_DCL23 <- apply(retention[, dcl23_score_columns], 1, mean)

# DCL5
retention <- add_miret_scores(retention, paste0(parties_dir, "DCL5_RNAi_r1_HBJ-2//ParTIES/MIRET/MIRET.tab"), "_DCL5")

# IESs that are significant in none of the four data sets above.
dependency_flags <- paste0("SIGNIFICANT_", c("EZL1", "TFIIS4", "DCL23", "DCL5"))
retention$SIGNIFICANT_ONLY_EXC_CPLX <- rowSums(retention[, dependency_flags]) == 0

# One IES group per dependency class, restricted to the "All" group.
for (dep in c("EZL1", "TFIIS4", "DCL23", "DCL5", "ONLY_EXC_CPLX")) {
    is_dependent <- retention[, paste0("SIGNIFICANT_", dep)]
    ies_groups[[dep]] <- intersect(retention[is_dependent, ]$ID, ies_groups[["All"]])
}





# for the last time points
# Only the samples whose label matches "FRAG" or "DEV4" are analysed. For each
# of them:
#   1. errors per IES in each dependency group (one PDF per sample),
#   2. for every IES group: errors per IES in each timing group, with and
#      without the error-category detail (two PDFs),
#   3. for every IES group: errors per IES in each timing group, split by
#      boundary sequence (one PDF).
# Only the "OVERLAP", "EXTERNAL" and "PARTIAL_EXTERNAL" error categories are
# counted in this section.
external_error_categories <- c("OVERLAP", "EXTERNAL", "PARTIAL_EXTERNAL")
dependency_groups <- c("ONLY_EXC_CPLX", "EZL1", "TFIIS4", "DCL23", "DCL5")
timing_group_names <- group_names[group_names != "None"]

for (p in grep("FRAG|DEV4", labels)) {
    prefix <- prefixes[p]
    label <- labels[p]

    if (is.null(excision_errors[[prefix]])) {
        warning("No excision errors available for ", prefix, "; sample skipped", call. = FALSE)
        next
    }

    # Errors of this sample restricted to the categories of interest. This
    # does not depend on the IES group, so it is computed once per sample.
    sample_errors <- excision_errors[[prefix]]
    sample_errors <- sample_errors[which(is.element(sample_errors$ERROR_CATEGORY, external_error_categories)), ]

    # between ies groups
    # One row per dependency group: number of errors, number of IESs, and the
    # chi-squared p-value of the group against the first row (ONLY_EXC_CPLX).
    results <- c()
    for (ies_group in dependency_groups) {
        tab <- sample_errors[which(is.element(sample_errors$CLOSEST_FEATURE_ID, ies_groups[[ies_group]])), ]

        pvalue <- NA
        if (ies_group != "ONLY_EXC_CPLX") {
            pvalue <- chisq.test(rbind(results[1, 1:2], c(nrow(tab), length(ies_groups[[ies_group]]))))$p.value
        }
        results <- rbind(results, c(nrow(tab), length(ies_groups[[ies_group]]), pvalue))
    }
    results <- as.data.frame(results)
    colnames(results) <- c("NB_ERROR", "NB_IES", "PVALUE")
    rownames(results) <- dependency_groups

    # A bare `results` is not auto-printed inside a loop, so print explicitly.
    print(results)
#~ Example of output:
#~                NB_ERROR NB_IES       PVALUE
#~ ONLY_EXC_CPLX      469  12392           NA
#~ EZL1              1849  31032 5.556736e-18
#~ TFIIS4            1113  20153 1.689176e-11
#~ DCL23              332   3245 2.491366e-43
#~ DCL5               150   2346 5.410399e-08

    pdf(paste0(img_dir, "excision_errors_for_each_DepGroups_for_", gsub(" ", "_", label), ".pdf"))
    par(mar = c(6.1, 4.1, 4.1, 2.1), xpd = TRUE)
    bp <- barplot(results$NB_ERROR / results$NB_IES, names.arg = rownames(results), ylab = "# of IES excision errors (per IES)", main = label, col = "darkgrey", las = 2)
    # Group sizes written just above the bars.
    text(bp, results$NB_ERROR / results$NB_IES + 0.003, paste0("N=", unlist(lapply(ies_groups, length)[rownames(results)])))
    dev.off()


    # for each timing groups
    for (ies_group in names(ies_groups)) {
        cur_img_dir <- paste0(img_dir, "/", ies_group, "/")
        dir.create(cur_img_dir, showWarnings = FALSE, recursive = TRUE)

        # normalized number of errors in each IES group categories
        # only considering certain types of errors : "OVERLAP","EXTERNAL","PARTIAL_EXTERNAL"
        # `results` becomes a category x timing-group table of errors per IES;
        # nb_ies_ids holds the number of IESs of each timing group.
        results <- data.frame()
        nb_ies_ids <- c()

        for (gname in timing_group_names) {
            current_ies_ids <- intersect(groups[groups$GROUP_NAME == gname, ]$ID, ies_groups[[ies_group]])
            tab <- sample_errors[which(is.element(sample_errors$CLOSEST_FEATURE_ID, current_ies_ids)), ]

            if (nrow(tab) > 0) {
                res <- as.data.frame(table(tab$ERROR_CATEGORY) / length(current_ies_ids))
            } else {
                # No error at all in this group: explicit zero for each category.
                res <- data.frame(c("OVERLAP", "EXTERNAL", "PARTIAL_EXTERNAL"), rep(0, 3))
            }
            colnames(res) <- c("ERROR_CATEGORY", prefix)
            nb_ies_ids <- c(nb_ies_ids, length(current_ies_ids))

            if (dim(results)[1] == 0) {
                results <- res
            } else {
                results <- merge(results, res, by = "ERROR_CATEGORY", all = TRUE)
            }
            # The column just added is renamed after its timing group.
            colnames(results)[ncol(results)] <- gname
        }
        rownames(results) <- results$ERROR_CATEGORY
        results <- results[, -1]

        results[is.na(results)] <- 0


        # Stacked bars, one colour per error category.
        pdf(paste0(cur_img_dir, "excision_errors_by_TimingGroups_for_", gsub(" ", "_", label), ".pdf"))
        par(mar = c(6.1, 4.1, 4.1, 2.1), xpd = TRUE)
        bp <- barplot(as.matrix(results), col = error_category_colors[rownames(results)], las = 2, ylab = "# of IES excision errors (per IES)", main = paste(label, ies_group))
        text(bp, apply(as.matrix(results), 2, sum) + 0.01, paste0("N=", nb_ies_ids), srt = 45)
        legend("topleft", legend = tolower(rownames(results)), col = error_category_colors[rownames(results)], pch = 15, ncol = 2, bty = "n")
        dev.off()

        # Same totals without the category detail, one colour per timing group.
        pdf(paste0(cur_img_dir, "excision_errors_by_TimingGroups_no_details_for_", gsub(" ", "_", label), ".pdf"))
        par(mar = c(6.1, 4.1, 4.1, 2.1), xpd = TRUE)
        bp <- barplot(apply(results, 2, sum), col = gcolors, ylab = "# of IES excision errors (per IES)", main = paste(label, ies_group), border = "white")
        text(bp, apply(as.matrix(results), 2, sum) + 0.01, paste0("N=", nb_ies_ids))
        dev.off()


        # effect of the boundary sequence ?
        # Rows: timing groups. Columns: all IESs of the group ("TA"), and the
        # subsets flagged TATAG / TACAG in `boundaries`.
        results <- data.frame()
        nb_ies_ids <- data.frame()

        for (gname in timing_group_names) {
            current_ies_ids <- intersect(groups[groups$GROUP_NAME == gname, ]$ID, ies_groups[[ies_group]])
            tab <- sample_errors[which(is.element(sample_errors$CLOSEST_FEATURE_ID, current_ies_ids)), ]

            gr <- list(
                "TA" = current_ies_ids,
                "TATAG" = intersect(current_ies_ids, boundaries[boundaries$TATAG, ]$ID),
                "TACAG" = intersect(current_ies_ids, boundaries[boundaries$TACAG, ]$ID)
            )
            res <- c()
            for (g in names(gr)) {
                tab_g <- tab[which(is.element(tab$CLOSEST_FEATURE_ID, gr[[g]])), ]
                res <- c(res, nrow(tab_g) / length(gr[[g]]))
            }
            results <- rbind(results, res)
            colnames(results) <- names(gr)

            nb_ies_ids <- rbind(nb_ies_ids, as.vector(unlist(lapply(gr, length))))
            colnames(nb_ies_ids) <- names(gr)
        }
        rownames(results) <- timing_group_names
        rownames(nb_ies_ids) <- timing_group_names

        boundaries_colors <- brewer.pal(ncol(results), "Accent")


        pdf(paste0(cur_img_dir, "excision_errors_by_TimingGroupsConsensus_and_SeqBoundaries_", gsub(" ", "_", label), ".pdf"))
        par(xpd = TRUE)
        bp <- barplot(t(results), beside = TRUE, col = boundaries_colors, main = paste(label, ies_group))
        # Number of IESs behind each bar, written slightly above it.
        text(bp, t(results) * 1.1, paste0("n=", t(nb_ies_ids)), srt = 45)
        legend("topleft", legend = colnames(results), col = boundaries_colors, pch = 15, bty = "n")
        dev.off()
    }
}






