  
source("../headers.R")
source("../functions.R")

source("../IES_Excision_Score/load_retention.R")

library(seqinr)
library(stringr)
library(binom)
library(ggseqlogo)
# Root output folders (relative to the working directory): one for figures,
# one for result tables.
base_img_dir <- "images/"
base_result_dir <- "results/"

# Create both folders up front; a folder that already exists is accepted silently.
invisible(lapply(
    c(base_img_dir, base_result_dir),
    dir.create,
    showWarnings = FALSE,
    recursive = TRUE
))




# Sequence-length table: one row per sequence id, sorted from longest to
# shortest and indexed by SEQ_ID. REVERSE is only initialised (to NA) here.
seqlength <- read.table("../data/ptetraurelia_mac_51.seqlength", header = FALSE)
colnames(seqlength) <- c("SEQ_ID", "SEQ_LENGTH")
longest_first <- order(seqlength$SEQ_LENGTH, decreasing = TRUE)
seqlength <- seqlength[longest_first, ]
seqlength$REVERSE <- NA
rownames(seqlength) <- seqlength$SEQ_ID

# IES feature table (tab-separated, with header), indexed by IES id.
ies <- read.table(
    "/data/PARAMECIUM/GENOMIC/tetraurelia/micronucleus/IES/51/internal_eliminated_sequence_PGM_IES51_features.tab",
    header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
rownames(ies) <- ies$ID

# Flag the IESs whose MAC flank contains "taTA" or "TAta".
# grepl() returns one logical per row, so this also works when no row matches;
# assigning through ies[grep(...), ]$FLOATING_IES fails on an empty selection.
ies$FLOATING_IES <- grepl("taTA|TAta", ies$MAC_FLANK_SEQ)

# Per-IES value returned by cindex() (defined outside this file), computed from
# the (ID, IES_SEQ) pair of each row.
ies$CINDEX <- apply(ies[, c("ID", "IES_SEQ")], 1, cindex, add_TA = FALSE)


# Timing group (GROUP_NAME) of each IES (ID).
groups <- read.table("../IES_Excision_Score/IES_Groups.tsv", header = TRUE, sep = "\t")

# Timing groups that are analysed, in display order. "None" is deliberately
# left out (and so is its colour, "grey").
group_names <- c("Very early", "Early", "Intermediate", "Late")

# Plot colour of each timing group, addressable by group name.
gcolors <- setNames(
    c("#E41A1C", "#FF7F00", "#4DAF4A", "dodgerblue"),
    group_names
)

#hist(ies$SIZE,breaks=seq(0,max(ies$SIZE)+1,1),col="red",xlim=c(0,200),ylim=c(0,500))

# Boundaries of the IES size classes; the last one is the largest IES size.
peaks <- c(25, 34, 42, 52, 63, 72, 82, 91, 102, 111, 122, 132, 141, max(ies$SIZE))



# IES ids per sensitivity class.
#  - "ALL": every IES whose timing group is not "None".
#  - one entry per listed class: the ids of `retention` rows selected by the
#    matching SIGNIFICANT_<class> column, restricted to "ALL".
ies_sensitivity_classes <- list(ALL = groups[groups$GROUP_NAME != "None", ]$ID)
for (cl in c("EZL1", "TFIIS4", "DCL23", "DCL5", "ONLY_EXC_CPLX", "PGM")) {
    significant_rows <- retention[, paste0("SIGNIFICANT_", cl)]
    significant_ids <- as.vector(retention[significant_rows, ]$ID)
    ies_sensitivity_classes[[cl]] <- intersect(significant_ids, ies_sensitivity_classes[["ALL"]])
}

# "EZL1_ONLY": in the EZL1 class but in none of TFIIS4, DCL23 and DCL5.
in_other_classes <- unique(c(
    ies_sensitivity_classes[["TFIIS4"]],
    ies_sensitivity_classes[["DCL23"]],
    ies_sensitivity_classes[["DCL5"]]
))
ies_sensitivity_classes[["EZL1_ONLY"]] <- setdiff(ies_sensitivity_classes[["EZL1"]], in_other_classes)


# Size classes between consecutive `peaks`. Each entry stores c(min, max); the
# entry name shows "min-(max - 1)", e.g. "25-33_nt" for c(25, 34).
size_lower <- head(peaks, -1)
size_upper <- tail(peaks, -1)
size_groups <- setNames(
    Map(c, size_lower, size_upper),
    paste0(size_lower, "-", size_upper - 1, "_nt")
)
#size_groups[["25-33_nt"]]=c(25,34)
# Extra class spanning several of the classes above.
size_groups[["42-140_nt"]] <- c(42, 141)
#size_groups[["141-MAX_nt"]]=c(141,max(peaks))


# Figures are written straight under the base image folder.
img_dir <- paste0(base_img_dir)
dir.create(img_dir, showWarnings = FALSE, recursive = TRUE)

# consensus

# Nested container of IES ids, addressed as
# seq_groups[[sensitivity class]][[size group]][[timing group]].
seq_groups <- list()


# One colour per size group. The "Set2" palette only defines 8 colours: asking
# brewer.pal() for more triggers a warning and returns 8, which then get
# recycled in the barplots (several size groups share a colour). Beyond 8
# groups the palette is interpolated so that every group has its own colour.
n_size_groups <- length(names(size_groups))
if (n_size_groups <= 8) {
    size_group_colors <- brewer.pal(n_size_groups, "Set2")
} else {
    size_group_colors <- colorRampPalette(brewer.pal(8, "Set2"))(n_size_groups)
}
# Split a vector of IES ids by timing group.
# Returns a list whose first element, "timings", holds all `ids`, followed by
# one element per timing group with the ids belonging to that group.
# The list is built explicitly: chaining `x[[a]][[b]] <- ids` on a missing slot
# creates an atomic vector (not a list) when `ids` has exactly one element, and
# the following assignments of empty id sets then fail.
split_ids_by_timing <- function(ids) {
    timing_names <- group_names[group_names != "None"]
    by_timing <- lapply(timing_names, function(timing_gname) {
        intersect(ids, groups[groups$GROUP_NAME == timing_gname, ]$ID)
    })
    names(by_timing) <- timing_names
    c(list(timings = ids), by_timing)
}

for (dep in names(ies_sensitivity_classes)) {
    dep_ies_ids <- ies_sensitivity_classes[[dep]]

    # "sizes": all sizes pooled
    seq_groups[[dep]] <- list(sizes = split_ids_by_timing(dep_ies_ids))

    for (size_gname in names(size_groups)) {
        # size class is [size_min, size_max)
        size_min <- size_groups[[size_gname]][1]
        size_max <- size_groups[[size_gname]][2]
        dep_size_ies_ids <- intersect(dep_ies_ids, ies[ies$SIZE >= size_min & ies$SIZE < size_max, ]$ID)

        seq_groups[[dep]][[size_gname]] <- split_ids_by_timing(dep_size_ies_ids)
    }
}


# Boundary sequences of every IES, indexed by IES id.
#  MAC_FLANK_SEQ : 5 word characters + "TA" + 5 word characters taken from the MAC flank
#  LEFT          : first 13 characters of the IES sequence
#  RIGHT         : result of get_right_boundary_seq() (defined outside this file)
mac_flank_window <- str_extract(ies$MAC_FLANK_SEQ, c("\\w{5}TA\\w{5}"))
left_boundary <- substr(ies$IES_SEQ, 0, 13)
right_boundary <- apply(
    ies[, c("ID", "IES_SEQ")], 1, get_right_boundary_seq,
    add_TA = FALSE, boundary_length = 13
)
boundaries <- data.frame(
    ID = ies$ID,
    LENGTH = ies$SIZE,
    MAC_FLANK_SEQ = mac_flank_window,
    LEFT = left_boundary,
    RIGHT = right_boundary
)

# LEFT prefixed with the part of the flank window that precedes "TA".
flank_before_ta <- sub("TA", "", str_extract(boundaries$MAC_FLANK_SEQ, c("\\w{5}TA")))
boundaries$LEFT_WITH_JUNC <- paste0(flank_before_ta, boundaries$LEFT)

# RIGHT prefixed with the part of the flank window that follows "TA", after it
# has gone through revCompSeq() (defined outside this file).
flank_after_ta <- sub("TA", "", str_extract(boundaries$MAC_FLANK_SEQ, c("TA\\w{5}")))
flank_after_ta_rc <- apply(
    data.frame(ID = boundaries$ID, MAC_FLANK_SEQ = flank_after_ta),
    1, revCompSeq
)
boundaries$RIGHT_WITH_JUNC <- paste0(flank_after_ta_rc, boundaries$RIGHT)
rownames(boundaries) <- boundaries$ID




weblogo_bases <- c("A", "C", "G", "T")
weglogo <- "/usr/local/src/anaconda/anaconda3/bin/weblogo"

# SEQLOGO
##################

# Pipe `fasta_file` through the weblogo executable and redirect its output to
# `out_file`.
#   title  : logo title (--title)
#   units  : value passed to --units ("bits" or "probability")
#   format : value passed to --format ("PDF", "PNG" or "logodata")
# Returns the exit status of the shell command invisibly; a non-zero status
# raises a warning instead of going unnoticed.
run_weblogo <- function(fasta_file, out_file, title, units = "bits", format = "PDF") {
    cmd <- paste0(
        "cat ", shQuote(fasta_file), " | ", weglogo,
        " -A dna -c classic --units ", units,
        " --composition 0.28  --resolution 300 --title ", shQuote(title),
        " --format ", format, " > ", shQuote(out_file)
    )
    status <- system(cmd)
    if (status != 0) {
        warning("weblogo exited with status ", status, " while writing ", out_file, call. = FALSE)
    }
    invisible(status)
}

#~ for(dep in names(seq_groups)) {
#for(dep in c("ALL","ONLY_EXC_CPLX","TFIIS4")) {
for (dep in c("ALL", "EZL1", "TFIIS4")) {
    for (size_gname in names(seq_groups[[dep]])) {
        for (timing_gname in names(seq_groups[[dep]][[size_gname]])) {

            cur_img_dir <- gsub(" ", "_", paste0(img_dir, dep, "/", size_gname, "/", timing_gname, "/"))
            dir.create(cur_img_dir, showWarnings = FALSE, recursive = TRUE)
            current_ies_ids <- seq_groups[[dep]][[size_gname]][[timing_gname]]
            print(paste(dep, size_gname, timing_gname, length(current_ies_ids)))

            # no logo for groups of two IESs or fewer
            if (length(current_ies_ids) > 2) {

                # Left and right boundaries are pooled in every logo ("BOTH").
                both_sides <- c(boundaries[current_ies_ids, "LEFT"], boundaries[current_ies_ids, "RIGHT"])
                both_sides_junc <- c(boundaries[current_ies_ids, "LEFT_WITH_JUNC"], boundaries[current_ies_ids, "RIGHT_WITH_JUNC"])
                seq_names <- c(paste0(current_ies_ids, "_LEFT"), paste0(current_ies_ids, "_RIGHT"))
                fasta_file <- paste0(cur_img_dir, "IES_boundary_seq.fa")

                # tag         : suffix of the output name
                # seqs        : sequences fed to weblogo
                # title_width : the title is truncated to that many characters
                # with_png    : also produce the two PNG logos (bits and probability)
                logo_sets <- list(
                    list(tag = c("BOTH", "13nt"), seqs = both_sides, title_width = 25, with_png = TRUE),
                    list(tag = c("BOTH", "8nt"), seqs = substr(both_sides, 1, 8), title_width = 15, with_png = FALSE),
                    # 5 flanking nt + the first 8 nt of the boundary
                    list(tag = c("BOTH", "JUNC", "8nt"), seqs = substr(both_sides_junc, 1, 8 + 5), title_width = 15, with_png = FALSE)
                )

                for (logo_set in logo_sets) {
                    out_name <- gsub(" ", "_", paste(c(dep, size_gname, timing_gname, logo_set$tag), collapse = "_"))
                    write.fasta(as.list(logo_set$seqs), seq_names, fasta_file, open = "w", nbchar = 60, as.string = FALSE)
                    web_logo_title <- substr(
                        paste0("N=", length(current_ies_ids) * 2, " ", gsub("_", " ", out_name)),
                        1, logo_set$title_width
                    )

                    run_weblogo(fasta_file, paste0(cur_img_dir, out_name, "_weblogo_bits.pdf"), web_logo_title, units = "bits", format = "PDF")
                    if (logo_set$with_png) {
                        run_weblogo(fasta_file, paste0(cur_img_dir, out_name, "_weblogo_bits.png"), web_logo_title, units = "bits", format = "PNG")
                        run_weblogo(fasta_file, paste0(cur_img_dir, out_name, "_weblogo_prop.png"), web_logo_title, units = "probability", format = "PNG")
                    }
                    # per-position table, read again by the differential analysis further down
                    run_weblogo(fasta_file, paste0(cur_img_dir, out_name, "_weblogo_bits.tsv"), web_logo_title, units = "bits", format = "logodata")

                    # the FASTA file is only a temporary input
                    unlink(fasta_file)
                }
            }

        }
    }
}



# STATISTICAL TEST DIFF BASE
##########################


# Draw a "difference logo" from a frequency table.
#   m     : data frame with one DELTA_FREQ_<base> column per entry of the
#           global `weblogo_bases` and one row per position (NA = nothing to
#           show for that base/position)
#   title : plot title
# Returns a ggseqlogo plot of the DELTA_FREQ values, or an empty placeholder
# plot when `m` has no row or holds no non-zero value.
diffLogo <- function(m, title = "") {
    placeholder <- function() {
        ggplot() + theme_void() + ggtitle("No significant results")
    }

    if (nrow(m) == 0) {
        # The message reports the file variables of the calling script when
        # they exist; get0() avoids an "object not found" error when the
        # function is used without them.
        print(paste(
            "File do not exist ", get0("ctl_file", ifnotfound = NA),
            " or ", get0("cur_file", ifnotfound = NA)
        ))
        return(placeholder())
    }

    # bases in rows, positions in columns; NA shown as "no difference"
    pmat <- t(m[, paste0("DELTA_FREQ_", weblogo_bases)])
    rownames(pmat) <- weblogo_bases
    pmat[is.na(pmat)] <- 0

    if (sum(pmat != 0) == 0) {
        return(placeholder())
    }

    ggseqlogo(pmat, method = 'custom', seq_type = 'dna') +
        ylab("delta frequencies") +
        geom_hline(yintercept = 0) +
        ggtitle(title)
}

# Compare the per-position base counts of two weblogo "logodata" tables.
#   current_file  : logodata table of the group of interest
#   control_file  : logodata table of the control group
#   pvalue_cutoff : adjusted p-value below which a difference is reported
#   weblogo_bases : base columns to test
# Returns the current table with, for every base, FREQ_CTL_<base>,
# FREQ_<base>, PVALUE_<base> (Bonferroni-adjusted two-sided binomial test
# against the upper bound of the exact 95% confidence interval of the control
# proportion) and DELTA_FREQ_<base> (FREQ - FREQ_CTL, NA unless the adjusted
# p-value is below the cut-off).
# Returns an empty data frame when either file does not exist.
# The files named by the arguments are the ones read: the body previously read
# the globals `cur_file`/`ctl_file` whatever was passed in.
getFreqMat <- function(current_file = cur_file, control_file = ctl_file, pvalue_cutoff = 0.001, weblogo_bases = c("A", "C", "G", "T")) {
    m <- data.frame()
    if (!(file.exists(control_file) && file.exists(current_file))) {
        return(m)
    }

    # the first column of each table is dropped before naming the rest
    logodata_columns <- c("A", "C", "G", "T", "Entropy", "Low", "High", "Weight")
    ctl_m <- read.table(control_file, sep = "\t")[, -1]
    colnames(ctl_m) <- logodata_columns
    m <- read.table(current_file, sep = "\t")[, -1]
    colnames(m) <- logodata_columns

    for (base in weblogo_bases) {
        ctl_pupper <- binom.confint(ctl_m[, base], rowSums(ctl_m[, weblogo_bases]), methods = "exact", conf.level = 0.95)$upper
        m[, paste0("FREQ_CTL_", base)] <- ctl_m[, base] / rowSums(ctl_m[, weblogo_bases])
        m[, paste0("FREQ_", base)] <- m[, base] / rowSums(m[, weblogo_bases])

        # one test per position: count of `base`, total count, reference proportion
        test_input <- cbind(m[, base], rowSums(m[, weblogo_bases]), ctl_pupper)
        pvalues <- p.adjust(
            apply(test_input, 1, function(x) {
                binom.test(x = x[1], n = x[2], p = x[3], alternative = "two.sided")$p.value
            }),
            method = "bonferroni"
        )
        m[, paste0("PVALUE_", base)] <- pvalues

        m[, paste0("DELTA_FREQ_", base)] <- ifelse(pvalues < pvalue_cutoff, m[, paste0("FREQ_", base)] - m[, paste0("FREQ_CTL_", base)], NA)

        #m[,paste0("PVALUE_",base)]=p.adjust(apply(cbind(m[,base],rowSums(m[,weblogo_bases]),ctl_pupper),1,function(x) { binom.test(x=x[1], n=x[2], p =x[3], alternative="two.sided")$p.value }),method="bonferroni")
        #m[,paste0("PVALUE_LESS_",base)]=p.adjust(apply(cbind(m[,base],rowSums(m[,weblogo_bases]),ctl_pupper),1,function(x) { binom.test(x=x[1], n=x[2], p =x[3], alternative="less")$p.value }),method="bonferroni")
    }
    m
}
                
    


#~ for(dep in names(seq_groups)) {
# Number of sequences behind a weblogo "logodata" table: the sum of the four
# base counts (columns 2 to 5) at the first position.
# Returns NA when the file does not exist, so that a missing table does not
# stop the script before getFreqMat()/diffLogo() handle it with a placeholder.
count_logo_sequences <- function(logodata_file) {
    if (!file.exists(logodata_file)) {
        return(NA)
    }
    sum(read.table(logodata_file, sep = "\t")[1, 2:5])
}

# Compare the boundary logo of the "Very early" and "Late" groups with the
# pooled group ("timings") and with each other, for every class and size group.
for (dep in c("ALL", "EZL1", "TFIIS4")) {
    for (size_gname in names(seq_groups[[dep]])) {
        #for(timing_gname in names(seq_groups[[dep]][[size_gname]])) {
        for (timing_gname in c("Very early", "Late")) {
            current_ies_ids <- seq_groups[[dep]][[size_gname]][[timing_gname]]
            if (length(current_ies_ids) > 2 && timing_gname != "timings") {

                cur_img_dir <- gsub(" ", "_", paste0(img_dir, dep, "/", size_gname, "/", timing_gname, "/"))
                dir.create(cur_img_dir, showWarnings = FALSE, recursive = TRUE)

                #for(boundary_side in c("LEFT","RIGHT","BOTH")) {
                for (boundary_side in c("BOTH")) {
                    cur_file <- gsub(" ", "_", paste0(img_dir, dep, "/", size_gname, "/", timing_gname, "/", gsub(" ", "_", paste(dep, size_gname, timing_gname, boundary_side, "13nt", sep = "_")), "_weblogo_bits.tsv"))
                    nb_seq_cur <- count_logo_sequences(cur_file)

                    for (ctl_timing_gname in c("timings", "Very early", "Late")) {
                        if (ctl_timing_gname != timing_gname) {
                            print(paste0(dep, size_gname, timing_gname, " VS ", ctl_timing_gname))

                            # The control table is absent when the control
                            # group had two IESs or fewer (no logo is produced
                            # for such groups).
                            ctl_file <- gsub(" ", "_", paste0(img_dir, dep, "/", size_gname, "/", ctl_timing_gname, "/", gsub(" ", "_", paste(dep, size_gname, ctl_timing_gname, boundary_side, "13nt", sep = "_")), "_weblogo_bits.tsv"))
                            nb_seq_ctl <- count_logo_sequences(ctl_file)

                            freq_mat <- getFreqMat(cur_file, ctl_file)
                            out_prefix <- gsub(" ", "_", paste0(cur_img_dir, "diffLogo_", timing_gname, "_VS_", ctl_timing_gname, "_", boundary_side))
                            write.table(freq_mat, paste0(out_prefix, ".tsv"), sep = "\t", quote = FALSE)

                            gp <- diffLogo(freq_mat, title = paste0(timing_gname, " VS ", ctl_timing_gname, "\nN=", nb_seq_cur, " and N=", nb_seq_ctl))
                            #ggsave(gsub(" ","_",paste0(cur_img_dir,"diffLogo_",timing_gname,"_VS_",ctl_timing_gname,"_",boundary_side,".pdf")), plot = gp)
                            ggsave(paste0(out_prefix, ".png"), plot = gp)
                        }
                    }
                }
            }
        }
    }
}

# Turn a p-value into a significance label.
#   x : named vector (e.g. one row of a table) with a "PVALUE" element that
#       can be converted to a number
# Returns "" above 1e-50, "*" at or below 1e-50, "**" at or below 1e-100,
# "***" at or below 1e-300 and "****" for a p-value of 0. The last threshold
# used to be written 1e-400, which is below the smallest representable double
# and therefore already evaluated to 0.
# A missing p-value gives "" instead of an error.
get_stars <- function(x) {
    pv <- as.numeric(x["PVALUE"])
    if (is.na(pv)) {
        return("")
    }
    cutoffs <- c(1e-50, 1e-100, 1e-300, 0)
    # one star per threshold reached
    strrep("*", sum(pv <= cutoffs))
}



# Summary of three consecutive positions (rows 3, 4 and 5 of a diffLogo table),
# each looked up for its own base: position 3 for cur_bases[1], position 4 for
# cur_bases[2], position 5 for cur_bases[3].
# Returns a character matrix with columns BASE, COMPARISON, FREQ_CTL, FREQ,
# DELTA_FREQ (all three as rounded percentages) and PVALUE.
base_position_summary <- function(freq_mat, cur_bases, comparison) {
    positions <- 3:5
    pick <- function(prefix) {
        unlist(Map(
            function(position, base) freq_mat[position, paste0(prefix, base)],
            positions, cur_bases
        ))
    }
    cbind(
        BASE = cur_bases,
        COMPARISON = comparison,
        FREQ_CTL = round(pick("FREQ_CTL_") * 100, 0),
        FREQ = round(pick("FREQ_") * 100, 0),
        DELTA_FREQ = round(pick("DELTA_FREQ_") * 100, 0),
        PVALUE = pick("PVALUE_")
    )
}

dep <- "ALL"
timing_gname <- "Very early"
boundary_side <- "BOTH"

# ---- 25-33 nt IESs: "Very early" versus all timings, then versus "Late" ----
res1 <- c()

size_gname <- "25-33_nt"
cur_bases <- c("T", "A", "G")

versus <- "timings"
cur_img_dir <- gsub(" ", "_", paste0(img_dir, dep, "/", size_gname, "/", timing_gname, "/"))
freq_mat <- read.table(gsub(" ", "_", paste0(cur_img_dir, "diffLogo_", timing_gname, "_VS_", versus, "_", boundary_side, ".tsv")), sep = "\t")
freq_mat[3:5,]
res <- base_position_summary(freq_mat, cur_bases, paste0(size_gname, " ", timing_gname, "_VS_", versus))
res1 <- cbind(res1, cbind(res, STARs = apply(res, 1, get_stars)))

versus <- "Late"
cur_img_dir <- gsub(" ", "_", paste0(img_dir, dep, "/", size_gname, "/", timing_gname, "/"))
freq_mat <- read.table(gsub(" ", "_", paste0(cur_img_dir, "diffLogo_", timing_gname, "_VS_", versus, "_", boundary_side, ".tsv")), sep = "\t")
freq_mat[3:5,]
res <- base_position_summary(freq_mat, cur_bases, paste0(size_gname, " ", timing_gname, "_VS_", versus))
res1 <- cbind(res1, cbind(res, STARs = apply(res, 1, get_stars)))

# ---- 42-140 nt IESs: same two comparisons, different bases ----
res2 <- c()

size_gname <- "42-140_nt"
cur_bases <- c("C", "A", "G")

versus <- "timings"
cur_img_dir <- gsub(" ", "_", paste0(img_dir, dep, "/", size_gname, "/", timing_gname, "/"))
freq_mat <- read.table(gsub(" ", "_", paste0(cur_img_dir, "diffLogo_", timing_gname, "_VS_", versus, "_", boundary_side, ".tsv")), sep = "\t")
freq_mat[3:5,]
res <- base_position_summary(freq_mat, cur_bases, paste0(size_gname, " ", timing_gname, "_VS_", versus))
res2 <- cbind(res2, cbind(res, STARs = apply(res, 1, get_stars)))

versus <- "Late"
cur_img_dir <- gsub(" ", "_", paste0(img_dir, dep, "/", size_gname, "/", timing_gname, "/"))
freq_mat <- read.table(gsub(" ", "_", paste0(cur_img_dir, "diffLogo_", timing_gname, "_VS_", versus, "_", boundary_side, ".tsv")), sep = "\t")
freq_mat[3:5,]
res <- base_position_summary(freq_mat, cur_bases, paste0(size_gname, " ", timing_gname, "_VS_", versus))
res2 <- cbind(res2, cbind(res, STARs = apply(res, 1, get_stars)))

# One row per base; the columns of the two comparisons sit side by side
# (columns 1-7: versus "timings", columns 8-14: versus "Late").
results <- rbind(res1, res2)
results

# BASE, FREQ_CTL and FREQ of the first comparison, FREQ_CTL of the second, and
# the stars of both.
results[, c(1, 3, 4, 10, 7, 14)]


# Left and right boundaries
#############################################

#for(dep in names(seq_groups)) {
# For every size group / timing group of the selected class(es), tabulate the
# first 5 characters of the LEFT and RIGHT boundary sequences and draw two
# heatmaps (one PDF each) in the group's image folder:
#   - Heatmap_TopBoundary_*: the most frequent left 5-mer against the five most
#     frequent right 5-mers (plus "OTHER");
#   - Heatmap_Boundaries_CompIndex_*: counts of unordered (left, right) 5-mer
#     pairs among the frequent 5-mers, drawn in the upper triangle.
for(dep in c("ALL")) {
    for(size_gname in names(seq_groups[[dep]])) {
        for(timing_gname in names(seq_groups[[dep]][[size_gname]])) {
            
            
            cur_img_dir=gsub(" ","_",paste0(img_dir,dep,"/",size_gname,"/",timing_gname,"/"))
            dir.create(cur_img_dir,showWarnings=F,  recursive=T)
            current_ies_ids=seq_groups[[dep]][[size_gname]][[timing_gname]]
            print(paste(dep,size_gname,timing_gname,length(current_ies_ids)))
            # groups of 10 IESs or fewer are skipped
            if(length(current_ies_ids) > 10) {
                current_boundaries=boundaries[current_ies_ids,]
                current_boundaries$LEFT_BOUNDARY=substr(current_boundaries[,"LEFT"],1,5)
                current_boundaries$RIGHT_BOUNDARY=substr(current_boundaries[,"RIGHT"],1,5)
                
                # most frequent left 5-mer
                b=table(current_boundaries$LEFT_BOUNDARY)
                bleft=names(b[order(b,decreasing=T)])[1]
                
                # five most frequent right 5-mers; every other one is pooled as "OTHER"
                # NOTE(review): [1:5] yields NA names when fewer than five distinct
                # right 5-mers exist in the group -- confirm this cannot happen.
                b=table(current_boundaries$RIGHT_BOUNDARY)
                bnames=names(b[order(b,decreasing=T)][1:5])
                current_boundaries$SECOND_BOUNDARY = ifelse(is.element(current_boundaries$RIGHT_BOUNDARY,bnames),current_boundaries$RIGHT_BOUNDARY,"OTHER")
                bnames=c(bnames,"OTHER")
                # one-row matrix: right-boundary counts among the IESs carrying the top left 5-mer
                bmat=t(as.matrix(table(current_boundaries[current_boundaries$LEFT_BOUNDARY==bleft,]$SECOND_BOUNDARY)[bnames]))
                rownames(bmat)=bleft
                colnames(bmat)=bnames
                # categories absent from the table come back as NA: count them as 0
                bmat[is.na(bmat)]=0
                
                col_fun = colorRamp2(c(0,max(bmat,na.rm=T)), c( "white", "red"))
                hm_title=paste0("N=",length(current_ies_ids)," ",paste(dep,size_gname,timing_gname,sep=" "))
                pdf(gsub(" ","_",paste0(cur_img_dir,"Heatmap_TopBoundary_",size_gname,"_",timing_gname,".pdf")),height=2)
                print(Heatmap(bmat, name = "number", col = col_fun,
                show_column_dend = FALSE, show_row_dend = TRUE,
                    column_order = colnames(bmat),
                    row_names_side = "left", column_names_side = "top", 
                 column_title = paste(hm_title,"\nRight boundary"), row_title = "Left",
                    cell_fun = function(j, i, x, y, width, height, fill) {
                        # write the count inside each cell
                        grid.text( bmat[i, j], x, y, gp = gpar(fontsize = 10))
                }))
                dev.off() 
                
                # Second heatmap: keep the 5-mers seen (left and right pooled) at least
                # b_cutoff times, i.e. 5% of the number of IESs in the group, rounded.
                b_cutoff=round(length(current_ies_ids)*0.05,0)
                b=table(c(current_boundaries$LEFT_BOUNDARY,current_boundaries$RIGHT_BOUNDARY))
                b=b[b>=b_cutoff]
                bnames=names(b[order(b,decreasing=T)])
                current_boundaries$FIRST_BOUNDARY = ifelse(is.element(current_boundaries$LEFT_BOUNDARY,bnames),current_boundaries$LEFT_BOUNDARY,"OTHER")
                current_boundaries$SECOND_BOUNDARY = ifelse(is.element(current_boundaries$RIGHT_BOUNDARY,bnames),current_boundaries$RIGHT_BOUNDARY,"OTHER")
                bnames=c(bnames,"OTHER")
                
                # ordered (left, right) pair counts
                btab=as.matrix(table(current_boundaries[,c("FIRST_BOUNDARY","SECOND_BOUNDARY")]))
                
                bmat=matrix(0,nrow=length(bnames),ncol=length(bnames))
                colnames(bmat)=bnames
                rownames(bmat)=bnames
                
                # Fold the ordered counts into the upper triangle: cell (i, j) with i <= j
                # receives the count of (i, j) plus, off the diagonal, the count of (j, i).
                # The is.element() tests skip categories that are missing from btab.
                for(i in 1:(length(bnames))) {
                    for(j in i:length(bnames)) { 
                        if(sum(is.element(bnames[i],rownames(btab)))==1 & sum(is.element(bnames[j],colnames(btab)))==1) {
                            bmat[bnames[i],bnames[j]]=bmat[bnames[i],bnames[j]]+btab[bnames[i],bnames[j]]
                        }
                        if(bnames[i]!=bnames[j]) {
                            if(sum(is.element(bnames[i],colnames(btab)))==1 & sum(is.element(bnames[j],rownames(btab)))==1) {
                                bmat[bnames[i],bnames[j]]=bmat[bnames[i],bnames[j]]+btab[bnames[j],bnames[i]]
                            }
                        }
                    }
                }
                
                col_fun = colorRamp2(c(0,max(bmat,na.rm=T)), c( "white", "red"))
                hm_title=paste0("N=",length(current_ies_ids)," ",paste(dep,size_gname,timing_gname,sep=" "))
                
#~                 print(Heatmap(bmat, name = "number", col = col_fun,
#~                 show_column_dend = FALSE, show_row_dend = FALSE,
#~                     column_order = colnames(bmat),row_order = rownames(bmat),
#~                     row_names_side = "left", column_names_side = "top", 
#~                  column_title = paste(hm_title,"\nSecond boundary"), row_title = "First",
#~                     cell_fun = function(j, i, x, y, width, height, fill) {
#~                         grid.text( ifelse(bmat[i, j]!=0,bmat[i, j],""), x, y, gp = gpar(fontsize = 10))
#~                 }))               
                
                pdf(gsub(" ","_",paste0(cur_img_dir,"Heatmap_Boundaries_CompIndex_",size_gname,"_",timing_gname,".pdf")))
                # rect_gp type "none": the default cell rectangles are not drawn, cell_fun draws everything
                print(Heatmap(bmat, name = "number", col = col_fun,rect_gp = gpar(type = "none"),
                show_column_dend = FALSE, show_row_dend = FALSE,
                    column_order = colnames(bmat),row_order = rownames(bmat),
                    row_names_side = "left", column_names_side = "top", 
                 column_title = paste(hm_title,"\nSecond boundary"), row_title = "First boundary",
                    cell_fun = function(j, i, x, y, width, height, fill) {
                        # only the upper triangle (diagonal included) is drawn
                        if(i <= j) {
                            if(bnames[j]=="OTHER" | bnames[i]=="OTHER") {
                                # pairs involving "OTHER": plain coloured rectangle
                                grid.rect(x = x, y = y, width = width*0.8, height = height*0.8, gp = gpar(fill = col_fun(bmat[i, j]),col=NA))
                            } else {
                                # circle coloured by the count; its radius is (comp_value - 2) / 8 of the cell size.
                                # comp_value() is defined outside this file -- presumably a
                                # complementarity score of the two 5-mers; TODO confirm its range.
                                grid.circle(x = x, y = y, r = (comp_value(rownames(bmat)[i],colnames(bmat)[j])-2)/8 * min(unit.c(width, height)), gp = gpar(fill = col_fun(bmat[i, j]), col = NA))
                            }
                            # zero counts are left blank
                            grid.text( ifelse(bmat[i, j]!=0,bmat[i, j],""), x, y, gp = gpar(fontsize = 10))
                            
                        } 
                        
                })) 
                dev.off() 
                
                
                
                
                
                
                

            }
        }
    }
}


# Cindex
#############################################
#for(dep in names(seq_groups)) {
# One PDF per size group: distribution of the CINDEX values of each timing
# group (and of the pooled group "timings", in black), drawn as lines.
for (dep in c("ALL")) {
    for (size_gname in names(seq_groups[[dep]])) {
        cur_img_dir <- gsub(" ", "_", paste0(img_dir, dep, "/", size_gname, "/"))
        dir.create(cur_img_dir, showWarnings = FALSE, recursive = TRUE)

        pdf(gsub(" ", "_", paste0(cur_img_dir, "plot_cindex", dep, "_", size_gname, ".pdf")))

        plot(NULL, xlim = c(0, 11), ylim = c(0, 1), axes = FALSE, ylab = "Density", xlab = "Cindex", main = paste(dep, size_gname))
        axis(2)
        # label k is written at the middle of the bin [k, k + 1)
        axis(1, at = seq(0.5, 11, 1), labels = seq(0, 10, 1))

        for (timing_gname in names(seq_groups[[dep]][[size_gname]])) {

            current_ies_ids <- seq_groups[[dep]][[size_gname]][[timing_gname]]
            print(paste(dep, size_gname, timing_gname, length(current_ies_ids)))

            # right = FALSE makes the bins [k, k + 1), matching the axis labels.
            # With hist()'s default right-closed bins, the values 0 and 1 share
            # the first bin and every larger integer value is drawn one label
            # too low. (Assumes CINDEX takes integer values, as the axis labels
            # suggest -- TODO confirm against cindex().)
            h <- hist(ies[current_ies_ids, ]$CINDEX, breaks = seq(0, 11, 1), right = FALSE, plot = FALSE)
            line_color <- if (timing_gname == "timings") "black" else gcolors[[timing_gname]]
            lines(h$mids, h$density, col = line_color, lwd = 2)

        }
        legend("topright", legend = names(seq_groups[[dep]][[size_gname]]), lwd = 2, bty = "n", col = c("black", gcolors[group_names]))

        dev.off()
    }
}





















# Number of IESs per sensitivity class (rows) and size group (columns), all
# timing groups pooled.
results <- matrix(
    0L,
    nrow = length(seq_groups), ncol = length(size_groups),
    dimnames = list(names(seq_groups), names(size_groups))
)
for (dep in names(seq_groups)) {
    for (size_gname in names(size_groups)) {
        results[dep, size_gname] <- length(seq_groups[[dep]][[size_gname]][["timings"]])
    }
}
rownames(results) <- names(seq_groups)
colnames(results) <- names(size_groups)

# Left panel: raw counts, one bar per size group; right panel: the same counts
# as row-wise proportions, stacked.
pdf(paste0(img_dir, "/barplot_nb_IESs_Dependencies_VS_sizes_groups.pdf"), width = 10)
par(mfrow = c(1, 2), mar = c(9.1, 4.1, 4.1, 2.1), xpd = TRUE)
barplot(t(results), beside = TRUE, col = size_group_colors, ylab = "Number of IESs", las = 2)
row_share <- results / rowSums(results)
barplot(t(row_share), col = size_group_colors, ylab = "Proportion", las = 2, ylim = c(0, 1.19))
legend("top", horiz = TRUE, bty = "n", legend = gsub("_", " ", names(size_groups)), pch = 15, col = size_group_colors)
dev.off()

write.table(results, paste0(img_dir, "/barplot_nb_IESs_Dependencies_VS_sizes_groups.tsv"), sep = "\t", quote = FALSE)



# Number of IESs of the "ALL" class per timing group (rows) and size group
# (columns).
timing_names <- group_names[group_names != "None"]
results <- matrix(
    0L,
    nrow = length(timing_names), ncol = length(size_groups),
    dimnames = list(timing_names, names(size_groups))
)
for (timing_gname in timing_names) {
    for (size_gname in names(size_groups)) {
        results[timing_gname, size_gname] <- length(seq_groups[["ALL"]][[size_gname]][[timing_gname]])
    }
}
rownames(results) <- group_names[group_names != "None"]
colnames(results) <- c(names(size_groups))

# Timing groups on the x axis, coloured by size group: counts, then row-wise
# proportions.
pdf(paste0(img_dir, "/barplot_nb_IESs_Timing_VS_sizes_groups.pdf"), width = 10)
par(mfrow = c(1, 2), mar = c(9.1, 4.1, 4.1, 2.1), xpd = TRUE)
barplot(t(results), beside = TRUE, col = c(size_group_colors), ylab = "Number of IESs", las = 2)
barplot(t(results / rowSums(results)), col = c(size_group_colors), ylab = "Proportion", las = 2, ylim = c(0, 1.19))
legend("top", horiz = TRUE, bty = "n", legend = gsub("_", " ", names(size_groups)), pch = 15, col = size_group_colors)
dev.off()

# Size groups on the x axis, coloured by timing group: counts, then
# column-wise proportions.
size_labels <- gsub("_", " ", names(size_groups))
pdf(paste0(img_dir, "/barplot_nb_IESs_sizes_groups_VS_Timing.pdf"), width = 10)
par(mfrow = c(1, 2), mar = c(7.1, 4.1, 4.1, 2.1), xpd = TRUE)
barplot(results, beside = TRUE, col = gcolors, ylab = "Number of IESs", las = 2, names.arg = size_labels)
barplot(sweep(results, 2, colSums(results), "/"), col = gcolors, ylab = "Proportion", las = 2, ylim = c(0, 1.19), names.arg = size_labels)
legend("top", horiz = TRUE, bty = "n", legend = group_names, pch = 15, col = gcolors)
dev.off()


write.table(results, paste0(img_dir, "/barplot_nb_IESs_Timing_VS_sizes_groups.tsv"), sep = "\t", quote = FALSE)

# Flat table with the number of IESs of every class / size group / timing
# group combination held in seq_groups.
stat_rows <- list()
for (dep in names(seq_groups)) {
    for (size_gname in names(seq_groups[[dep]])) {
        for (timing_gname in names(seq_groups[[dep]][[size_gname]])) {
            n_ies <- length(seq_groups[[dep]][[size_gname]][[timing_gname]])
            stat_rows[[length(stat_rows) + 1]] <- c(dep, size_gname, timing_gname, n_ies)
        }
    }
}
results <- do.call(rbind, stat_rows)
colnames(results) <- c("Dependency", "Size", "Timings", "Number")

write.table(results, paste0(img_dir, "/stats.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)


# For every sensitivity class, cross its IESs with each timing group x size
# group cell of the "ALL" class. `results` (timing groups in rows, size groups
# in columns) is computed twice; nothing is written to disk, and the second
# computation replaces the first.
for (dep in names(ies_sensitivity_classes)) {
    dep_ies_ids <- intersect(ies_sensitivity_classes[[dep]], seq_groups[["ALL"]][["sizes"]][["timings"]])

    # 1) share of the class found in each cell
    results <- matrix(
        NA_real_,
        nrow = length(group_names), ncol = length(size_groups),
        dimnames = list(group_names, names(size_groups))
    )
    for (size_gname in names(size_groups)) {
        for (timing_gname in group_names) {
            current_ies_ids <- seq_groups[["ALL"]][[size_gname]][[timing_gname]]
            results[timing_gname, size_gname] <- length(intersect(dep_ies_ids, current_ies_ids)) / length(dep_ies_ids)
        }
    }

    # 2) share of each cell that belongs to the class
    results <- matrix(
        NA_real_,
        nrow = length(group_names), ncol = length(size_groups),
        dimnames = list(group_names, names(size_groups))
    )
    for (size_gname in names(size_groups)) {
        for (timing_gname in group_names) {
            current_ies_ids <- seq_groups[["ALL"]][[size_gname]][[timing_gname]]
            #res=c(res,length(current_ies_ids))
            results[timing_gname, size_gname] <- length(intersect(dep_ies_ids, current_ies_ids)) / length(current_ies_ids)
        }
    }
}
    
    

weblogo_bases <- c("A", "C", "G", "T")
weglogo <- "/usr/local/src/anaconda/anaconda3/bin/weblogo"

# Pipe `fasta_file` through the weblogo executable and redirect its output to
# `out_file`.
#   title  : logo title (--title)
#   units  : value passed to --units ("bits" or "probability")
#   format : value passed to --format ("PDF", "PNG" or "logodata")
# Returns the exit status of the shell command invisibly; a non-zero status
# raises a warning instead of going unnoticed.
run_weblogo <- function(fasta_file, out_file, title, units = "bits", format = "PDF") {
    cmd <- paste0(
        "cat ", shQuote(fasta_file), " | ", weglogo,
        " -A dna -c classic --units ", units,
        " --composition 0.28  --resolution 300 --title ", shQuote(title),
        " --format ", format, " > ", shQuote(out_file)
    )
    status <- system(cmd)
    if (status != 0) {
        warning("weblogo exited with status ", status, " while writing ", out_file, call. = FALSE)
    }
    invisible(status)
}

# Logos of the left boundaries, of the right boundaries, and of both pooled,
# for every size group / timing group of the selected classes.
#~ for(dep in names(seq_groups)) {
for (dep in c("ALL", "ONLY_EXC_CPLX", "TFIIS4")) {
    for (size_gname in names(seq_groups[[dep]])) {
        for (timing_gname in names(seq_groups[[dep]][[size_gname]])) {

            cur_img_dir <- gsub(" ", "_", paste0(img_dir, dep, "/", size_gname, "/", timing_gname, "/"))
            dir.create(cur_img_dir, showWarnings = FALSE, recursive = TRUE)
            current_ies_ids <- seq_groups[[dep]][[size_gname]][[timing_gname]]
            print(paste(dep, size_gname, timing_gname, length(current_ies_ids)))

            # no logo for groups of two IESs or fewer
            if (length(current_ies_ids) > 2) {
                fasta_file <- paste0(cur_img_dir, "IES_boundary_seq.fa")

                for (boundary_side in c("LEFT", "RIGHT", "BOTH")) {
                    if (boundary_side == "BOTH") {
                        # both boundaries of every IES, so twice as many sequences
                        side_seqs <- c(boundaries[current_ies_ids, "LEFT"], boundaries[current_ies_ids, "RIGHT"])
                        side_names <- c(paste0(current_ies_ids, "_LEFT"), paste0(current_ies_ids, "_RIGHT"))
                        nb_seq <- length(current_ies_ids) * 2
                    } else {
                        side_seqs <- boundaries[current_ies_ids, boundary_side]
                        side_names <- current_ies_ids
                        nb_seq <- length(current_ies_ids)
                    }

                    out_name <- gsub(" ", "_", paste(dep, size_gname, timing_gname, boundary_side, sep = "_"))
                    write.fasta(as.list(side_seqs), side_names, fasta_file, open = "w", nbchar = 60, as.string = FALSE)
                    #system(paste0("cat ",base_result_dir,cat_seq,"_IES_consensus.fa | ",weglogo," -A dna -c classic --units probability --resolution 300 --title '",cat_seq," IES' --format PDF > ",cur_img_dir,cat_seq,"_IES_consensus_weblogo_prop.pdf"))

                    # The PDF and the logodata table carry the full title (cut
                    # at 30 characters); the PNG logos only show the number of
                    # sequences.
                    web_logo_title <- substr(paste0("N=", nb_seq, " ", gsub("_", " ", out_name)), 1, 30)
                    short_title <- paste0("N=", nb_seq)

                    run_weblogo(fasta_file, paste0(cur_img_dir, out_name, "_weblogo_bits.pdf"), web_logo_title, units = "bits", format = "PDF")
                    run_weblogo(fasta_file, paste0(cur_img_dir, out_name, "_weblogo_bits.png"), short_title, units = "bits", format = "PNG")
                    run_weblogo(fasta_file, paste0(cur_img_dir, out_name, "_weblogo_prop.png"), short_title, units = "probability", format = "PNG")
                    run_weblogo(fasta_file, paste0(cur_img_dir, out_name, "_weblogo_bits.tsv"), web_logo_title, units = "bits", format = "logodata")

                    # the FASTA file is only a temporary input
                    unlink(fasta_file)
                }
            }

        }
    }
}




