

source("../headers.R")
source("../functions.R")
source("load_data.R")
source("load_retention.R")



# Base output directories, relative to the working directory.
base_img_dir <- "images/"
base_result_dir <- "results/"

# Sub-directory that receives the files written by this script.
img_dir <- paste0(base_img_dir, "Clustering/")

# Create the directories (parents included); already existing directories
# are accepted silently.
for (out_dir in c(base_img_dir, base_result_dir, img_dir)) {
    dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
}

# Scaffold length table (two unnamed columns: id and length), sorted from the
# longest to the shortest scaffold and keyed by scaffold id.
seqlength <- read.table("../data/ptetraurelia_mac_51.seqlength", header = FALSE)
colnames(seqlength) <- c("SEQ_ID", "SEQ_LENGTH")
seqlength <- seqlength[order(seqlength$SEQ_LENGTH, decreasing = TRUE), ]
rownames(seqlength) <- seqlength$SEQ_ID
# Orientation flag, undefined until it is computed for the analysed scaffolds.
seqlength$REVERSE <- NA

# IES feature table, keyed by IES id.
ies <- read.table(
    "../data/internal_eliminated_sequence_PGM_IES51_features.tab",
    header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
rownames(ies) <- ies$ID

# IES position expressed as a fraction of the SEQ_ID_LENGTH column.
ies$RELATIVE_POSITION <- ies$POSITION / ies$SEQ_ID_LENGTH

# Only scaffolds longer than 30 kb are analysed.
seq_ids <- seqlength[seqlength$SEQ_LENGTH > 30000, ]$SEQ_ID

# Choose an orientation for every analysed scaffold: REVERSE is FALSE when
# strictly more IESs lie in the first quarter of the scaffold than in the last
# quarter, and TRUE otherwise.
for (seq_id in seq_ids) {
    rel_pos <- ies[ies$SEQ_ID == seq_id, ]$RELATIVE_POSITION
    n_first_quarter <- sum(rel_pos < 0.25)
    n_last_quarter <- sum(rel_pos > 0.75)
    # Counts are compared directly (both fractions shared the same denominator).
    # A scaffold without any IES used to give 0/0 = NaN, hence REVERSE = NA and
    # a failing `if (NA)` below; such a scaffold is now left unreversed.
    seqlength[seq_id, ]$REVERSE <-
        length(rel_pos) > 0 && !(n_first_quarter > n_last_quarter)
}

# Mirror the relative IES positions on the scaffolds flagged as reversed.
for (seq_id in seq_ids) {
    if (seqlength[seq_id, ]$REVERSE) {
        on_scaffold <- ies$SEQ_ID == seq_id
        ies[on_scaffold, ]$RELATIVE_POSITION <-
            1 - ies[on_scaffold, ]$RELATIVE_POSITION
    }
}


# Scaffold ids listed in the first column of complete_MAC_scaffold.lst.
complete_seq_ids <- as.vector(read.table("complete_MAC_scaffold.lst")[, 1])

# Annotation (GFF3, unnamed columns V1..V9) restricted to the analysed
# scaffolds and joined with the scaffold length / orientation table.
gff <- read.table(
    "../data/ptetraurelia_mac_51_annotation_v2.0.gff3",
    header = FALSE, sep = "\t"
)
gff <- gff[gff$V1 %in% seq_ids, ]
gff <- merge(gff, seqlength, by.x = "V1", by.y = "SEQ_ID", all.x = TRUE)

# Feature midpoint (mean of columns V4 and V5) as a fraction of scaffold length.
gff$RELATIVE_POSITION <- rowMeans(gff[, c("V4", "V5")]) / gff$SEQ_LENGTH

# Mirror the features of the scaffolds flagged as reversed.
for (seq_id in seq_ids) {
    if (seqlength[seq_id, ]$REVERSE) {
        on_scaffold <- gff$V1 == seq_id
        gff[on_scaffold, ]$RELATIVE_POSITION <-
            1 - gff[on_scaffold, ]$RELATIVE_POSITION
    }
}

# Feature subsets selected on the type column (V3).
cds <- gff[gff$V3 == "CDS", ]
gene <- gff[gff$V3 == "gene", ]


# Coverage intervals (scaffold, start, end, value) for the analysed scaffolds.
gene_genome_cov <- read.table(
    "ptetraurelia_mac_51_annotation_v2.0.CDS.cov.tsv",
    sep = "\t", header = FALSE, stringsAsFactors = FALSE
)
colnames(gene_genome_cov) <- c("SEQ_ID", "START", "END", "COV")
# NOTE(review): END - START is the interval width only if START is 0-based and
# END exclusive -- confirm against the tool that produced the file.
gene_genome_cov$LENGTH <- gene_genome_cov$END - gene_genome_cov$START
gene_genome_cov <- gene_genome_cov[gene_genome_cov$SEQ_ID %in% seq_ids, ]

#' Sum a per-position coverage track into equal-width bins along one scaffold.
#'
#' The interval table is expanded to one value per position
#' (`rep(COV, LENGTH)`), the positions `1..seq_length` are split into `bin`
#' contiguous, non-overlapping windows, and the values are summed per window.
#'
#' Compared with the previous implementation, exactly `bin` values are
#' returned (it returned `bin - 1` and never counted the last window), and
#' positions on a window boundary are counted once instead of twice.
#'
#' @param norm_seq_cov data.frame with columns `COV` (value of an interval)
#'   and `LENGTH` (number of positions it spans), rows in positional order.
#' @param seq_length Number of positions of the scaffold.
#' @param bin Number of bins to return.
#' @param reverse If TRUE, the bins are returned in reverse order.
#' @return Numeric vector of length `bin`. A window reaching past the
#'   positions described by `norm_seq_cov` sums to NA.
binGeneCoverage <- function(norm_seq_cov, seq_length, bin = 100, reverse = FALSE) {
    # One coverage value per position.
    seq_cov <- rep(norm_seq_cov$COV, norm_seq_cov$LENGTH)
    # bin + 1 edges; window i covers positions (edges[i] + 1)..edges[i + 1].
    edges <- round(seq(0, seq_length, length.out = bin + 1), 0)
    bin_cov <- vapply(seq_len(bin), function(i) {
        width <- edges[i + 1] - edges[i]
        # seq_len() keeps an empty window empty (sum 0) instead of counting down.
        sum(seq_cov[edges[i] + seq_len(width)])
    }, numeric(1))
    if (reverse) {
        bin_cov <- rev(bin_cov)
    }
    bin_cov
}

# Binned coverage matrix: one row per analysed scaffold, one column per bin.
bin_gene_cov <- 500
gene_cov <- NULL
for (seq_id in seq_ids) {
    scaffold_cov <- gene_genome_cov[gene_genome_cov$SEQ_ID == seq_id, ]
    scaffold_bins <- binGeneCoverage(
        scaffold_cov,
        seq_length = seqlength[seq_id, ]$SEQ_LENGTH,
        reverse = seqlength[seq_id, ]$REVERSE,
        bin = bin_gene_cov
    )
    # deparse.level = 0: do not label the new row after the variable name.
    gene_cov <- rbind(gene_cov, scaffold_bins, deparse.level = 0)
#~     print(paste(seq_id,length(binGeneCoverage(gene_genome_cov[gene_genome_cov$SEQ_ID==seq_id,],seq_length=seqlength[seq_id,]$SEQ_LENGTH, reverse=seqlength[seq_id,]$REVERSE))))
}

# Coverage summed over scaffolds, per bin. No pdf() device is opened here, so
# both plots go to whichever graphics device is current.
cov_per_bin <- apply(gene_cov, 2, sum)
barplot(cov_per_bin)
plot(seq(0, 1, length.out = ncol(gene_cov)), cov_per_bin, type = "l")




# IES dating table: keep the id and the two LCA columns, drop the rows lacking
# an age class or an id, and key the table by ID_PGM.
dated <- read.table(
    "../data/date_internal_eliminated_sequence_MIC_tetraurelia.pt_51.tab",
    header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
dated_is_complete <- !(is.na(dated$LCA_ageclass) | is.na(dated$ID_PGM))
dated <- dated[dated_is_complete, c("ID_PGM", "LCA_ageclass", "LCA_age")]
rownames(dated) <- dated$ID_PGM


# Table of highly conserved IESs (its ID column is used further down).
highly_conserved <- read.table(
    "../data/ptet_HighlyConserved_IESs.tsv",
    header = TRUE, sep = "\t", stringsAsFactors = FALSE
)


# Group assigned to each IES (id, group name, group rank), keyed by IES id.
groups <- read.table("SupTableS5.tsv", sep = "\t", header = TRUE)[, c("ID", "GROUP_NAME", "GROUP_RANK")]
rownames(groups) <- groups$ID

# Groups that are analysed, in display order, and their colours.
group_names <- c("Very early", "Early", "Intermediate", "Late"
#,"None"
)
gcolors <- c("#E41A1C", "#FF7F00", "#4DAF4A", "dodgerblue"
#,"grey"
)

names(gcolors) <- group_names

# Boundaries of the IES length windows. The windows are applied as
# [peaks[k], peaks[k + 1]), so the last boundary must lie one above the longest
# IES; with max(ies$SIZE) itself the longest IES fell into no window.
peaks <- c(25, 34, 42, 52, 63, 72, 82, 91, 102, 111, 200, max(ies$SIZE) + 1)


# Length

# Empirical cumulative distribution of IES length: all IESs in black, then one
# coloured curve per group.
pdf(paste0(img_dir, "/ecdf_IES_length.pdf"))
plot(
    ecdf(ies$SIZE),
    xlim = c(25, 200), xlab = "IES length (bp)", main = "",
    cex = 1.3, cex.axis = 1.3, cex.lab = 1.3
)
for (grp in group_names) {
    grp_ids <- groups$ID[groups$GROUP_NAME == grp]
    lines(ecdf(ies[grp_ids, ]$SIZE), col = gcolors[grp])
}
legend(
    "right",
    legend = c("All", group_names), col = c("black", gcolors[group_names]),
    lwd = 2, bty = "n", cex = 1.3
)
dev.off()





# IES length distribution at 1-nt resolution: one density curve per group,
# then the curve for all IESs.
pdf(paste0(img_dir, "/hist_density_IES_length.pdf"))
size_breaks_1nt <- seq(0, max(ies$SIZE))
plot(
    NULL,
    xlim = c(25, 150), ylim = c(0, 0.15),
    xlab = "IES length (bp)", ylab = "IES fraction", main = "",
    cex = 1.3, cex.axis = 1.3, cex.lab = 1.3
)
for (grp in group_names) {
    grp_ids <- groups$ID[groups$GROUP_NAME == grp]
    h <- hist(ies[grp_ids, ]$SIZE, breaks = size_breaks_1nt, plot = FALSE)
    lines(h$mids, h$density, lwd = 2, col = gcolors[grp])
}
h <- hist(ies$SIZE, breaks = size_breaks_1nt, plot = FALSE)
lines(h$mids, h$density, lwd = 1, col = "black")
# NOTE(review): the legend shows "All" dashed (lty 5) with lwd 2, whereas the
# curve above is drawn solid with lwd 1.
legend(
    "right",
    legend = c("All", group_names), col = c("black", gcolors[group_names]),
    lwd = 2, lty = c(5, rep(1, 4)), bty = "n", cex = 1.3
)
dev.off()






# Share of each group among the IESs whose group is not "None": first over all
# of them, then inside every length window [peaks[k], peaks[k + 1]).
classified_ids <- groups[groups$GROUP_NAME != "None", ]$ID
results <- c(table(groups$GROUP_NAME)[group_names] / length(classified_ids))
nbs <- c(nrow(groups))
window_start <- peaks[-length(peaks)]
window_end <- peaks[-1]
for (k in seq_along(window_start)) {
    in_window <- ies$SIZE >= window_start[k] & ies$SIZE < window_end[k]
    cur_ies_ids <- intersect(ies[in_window, ]$ID, classified_ids)
    window_share <- table(groups[cur_ies_ids, ]$GROUP_NAME)[group_names] / length(cur_ies_ids)
    results <- rbind(results, window_share)
    nbs <- c(nbs, length(cur_ies_ids))
}

# Row 1 is the overall share, the following rows are the windows.
rownames(results) <- c("All", paste0(window_start, "-", window_end, "nt"))

#~ pdf(paste0(img_dir,"/GroupProp_in_each_SizePeaks.pdf"))
#~ par(mar=c(6.1, 4.1, 4.1, 2.1))
#~ bp<-barplot(t(results),las=2,main="",ylab="IES proportion",xlab="",col=gcolors,ylim=c(0,1.15))
#~ text(bp,1.02,nbs)
#~ legend("topleft",legend=colnames(results),col=gcolors,pch=15,horiz=T,bty="n")
#~ dev.off()

# One line per group across the windows; the dashed horizontal line is the
# overall share of that group (row "All").
pdf(paste0(img_dir, "/GroupPropLines_in_each_SizePeaks.pdf"))
par(mar = c(6.1, 4.1, 4.1, 2.1))
window_rows <- 2:nrow(results)
plot(
    NULL,
    ylim = c(0, max(results)), xlim = c(2, nrow(results)),
    ylab = "Proportion", xlab = "IES length peaks", axes = FALSE
)
axis(2)
axis(1, at = seq(2, nrow(results)), labels = rownames(results)[-1], las = 2)
for (grp in group_names) {
    lines(window_rows, results[-1, grp], lwd = 2, col = gcolors[grp])
    abline(h = results[1, grp], lty = 2, col = gcolors[grp])
}
legend("bottom", legend = group_names, col = gcolors, lwd = 2, bty = "n")
dev.off()

# For every length window: the fraction of all classified IESs (group not
# "None") that fall in the window, followed by the same fraction computed
# inside each group.
results <- c()
nbs <- c()
n_classified <- nrow(groups[groups$GROUP_NAME != "None", ])
for (s in 2:length(peaks)) {
    size_lo <- peaks[s - 1]
    size_hi <- peaks[s]
    cur_ies_ids <- intersect(
        ies[ies$SIZE >= size_lo & ies$SIZE < size_hi, ]$ID,
        groups[groups$GROUP_NAME != "None", ]$ID
    )
    nbs <- c(nbs, length(cur_ies_ids))
    per_group <- vapply(group_names, function(grp) {
        grp_ids <- groups[groups$GROUP_NAME == grp, ]$ID
        length(intersect(grp_ids, cur_ies_ids)) / length(grp_ids)
    }, numeric(1), USE.NAMES = FALSE)
    res <- c(length(cur_ies_ids) / n_classified, per_group)
    results <- rbind(results, res)
    # Label with inclusive bounds, e.g. "25-33".
    rownames(results)[nrow(results)] <- paste0(size_lo, "-", size_hi - 1)
}
colnames(results) <- c("All", group_names)

bar_colors <- c("gray", gcolors)
pdf(paste0(img_dir, "/GroupProp_in_each_SizePeaks.pdf"), width = 12)
barplot(
    t(results),
    beside = TRUE, main = "", ylab = "IES fraction", xlab = "IES length (nt)",
    col = bar_colors, border = "white"
)
legend("top", legend = c("All", group_names), col = bar_colors, pch = 15, bty = "n", ncol = 2)
dev.off()



# Length categories given as consecutive (lower, upper) pairs, applied as
# lower <= SIZE < upper: 25-33, 42-140 and 141 and above.
# The upper bound of the last category lies one above the longest IES; with
# max(ies$SIZE) itself the longest IES was excluded from the ">140 nt" class.
sizes <- c(25, 34, 42, 141, 141, max(ies$SIZE) + 1)
nbs <- c()

# Per category: fraction of all classified IESs (group not "None") that fall
# in the category, followed by the same fraction inside each group.
results <- c()
n_classified <- nrow(groups[groups$GROUP_NAME != "None", ])
for (s in seq(1, length(sizes), 2)) {
    in_category <- ies$SIZE >= sizes[s] & ies$SIZE < sizes[s + 1]
    cur_ies_ids <- intersect(ies[in_category, ]$ID, groups[groups$GROUP_NAME != "None", ]$ID)
    nbs <- c(nbs, length(cur_ies_ids))
    res <- c(length(cur_ies_ids) / n_classified)
    for (gname in group_names) {
        group_ids <- groups[groups$GROUP_NAME == gname, ]$ID
        res <- c(res, length(intersect(group_ids, cur_ies_ids)) / length(group_ids))
    }
    results <- rbind(results, res)
    rownames(results)[nrow(results)] <- paste0(sizes[s], "-", sizes[s + 1] - 1, " nt")
}
colnames(results) <- c("All", group_names)
# The last category is open-ended: relabel it from its lower bound only
# (s still holds the index of the last lower bound).
rownames(results)[nrow(results)] <- paste0(">", sizes[s] - 1, " nt")

pdf(paste0(img_dir, "/GroupProp_in_each_SizeCategory.pdf"))
barplot(t(results), beside = TRUE, main = "", ylab = "IES fraction", xlab = "IES length (nt)", col = c("gray", gcolors), border = "white")
legend("topright", legend = c("All", group_names), col = c("gray", gcolors), pch = 15, bty = "n")
dev.off()


# Group composition (shares of each group) of all classified IESs, then of the
# IESs in every length category defined by the (lower, upper) pairs of `sizes`.
not_none_ids <- groups[groups$GROUP_NAME != "None", ]$ID
results <- c(table(groups$GROUP_NAME)[group_names] / length(not_none_ids))
for (s in seq(1, length(sizes), 2)) {
    lower <- sizes[s]
    upper <- sizes[s + 1]
    cur_ies_ids <- intersect(ies[ies$SIZE >= lower & ies$SIZE < upper, ]$ID, not_none_ids)
    category_share <- table(groups[cur_ies_ids, ]$GROUP_NAME)[group_names] / length(cur_ies_ids)
    results <- rbind(results, category_share)
    rownames(results)[nrow(results)] <- paste0(lower, "-", upper - 1, " nt")
}
rownames(results)[1] <- "All"
# The last category is labelled from its lower bound only.
rownames(results)[nrow(results)] <- paste0(">", sizes[s] - 1, " nt")


# Stacked bars: one bar per row of `results`, one segment per group.
pdf(paste0(img_dir, "/GroupProp_in_each_SizeCategory_v2.pdf"))
barplot(
    t(results),
    main = "", ylab = "IES fraction", xlab = "IES length (nt)",
    col = gcolors, border = "white"
)
dev.off()


# location


# Density of IESs along the (oriented) relative scaffold position, for all
# classified IESs (black) and per group (colours), on the analysed scaffolds.
# The CDS density is overlaid with its own y axis, drawn on the right.
pdf(paste0(img_dir, "/IES_density_along_scaffolds.pdf"))
bin_density <- 0.05
position_breaks <- seq(0, 1, bin_density)
scaffold_ies_ids <- ies[ies$SEQ_ID %in% seq_ids, ]$ID
all_ids <- intersect(groups[groups$GROUP_NAME != "None", ]$ID, scaffold_ies_ids)
h <- hist(ies[all_ids, ]$RELATIVE_POSITION, breaks = position_breaks, plot = FALSE)
plot(
    h$mids, h$density,
    ylim = c(0, 2), type = "l",
    xlab = "Relative scaffold position", ylab = "IES density", lwd = 2
)
for (gname in group_names[group_names != "None"]) {
    group_ies_ids <- intersect(groups[groups$GROUP_NAME == gname, ]$ID, scaffold_ies_ids)
    hg <- hist(ies[group_ies_ids, ]$RELATIVE_POSITION, breaks = position_breaks, plot = FALSE)
    lines(hg$mids, hg$density, lwd = 2, col = gcolors[gname])
}
#legend("bottom",legend=c("All IES",group_names),col=c("black",gcolors),lwd=2,bty="n")
# Draw the next plot on top of the current one instead of starting a new page.
par(new = TRUE)

h <- hist(cds[cds$V1 %in% seq_ids, ]$RELATIVE_POSITION, breaks = position_breaks, plot = FALSE)
#~ h<-hist(gene$RELATIVE_POSITION,breaks=seq(0,1,bin_density),plot=F)

plot(
    h$mids, h$density,
    ylim = c(0.8, 1.1), type = "l", xlab = "", ylab = "",
    lwd = 2, lty = 2, col = "darkgrey", axes = FALSE
)
axis(4)
#legend("bottom",legend=c("CDS"),col=c("darkgrey"),lwd=2,lty=2,bty="n")

dev.off()




# Same figure restricted to the scaffolds listed in complete_seq_ids.
pdf(paste0(img_dir, "/IES_density_along_complete_scaffolds.pdf"))
bin_density <- 0.05
position_breaks <- seq(0, 1, bin_density)
complete_ies_ids <- ies[ies$SEQ_ID %in% complete_seq_ids, ]$ID
all_ids <- intersect(groups[groups$GROUP_NAME != "None", ]$ID, complete_ies_ids)
h <- hist(ies[all_ids, ]$RELATIVE_POSITION, breaks = position_breaks, plot = FALSE)
plot(
    h$mids, h$density,
    ylim = c(0, 2.5), type = "l",
    xlab = "Relative scaffold position", ylab = "IES density", lwd = 2
)
for (gname in group_names[group_names != "None"]) {
    group_ies_ids <- intersect(groups[groups$GROUP_NAME == gname, ]$ID, complete_ies_ids)
    hg <- hist(ies[group_ies_ids, ]$RELATIVE_POSITION, breaks = position_breaks, plot = FALSE)
    lines(hg$mids, hg$density, lwd = 2, col = gcolors[gname])
}
#legend("bottom",legend=c("All IES",group_names),col=c("black",gcolors),lwd=2,bty="n")
par(new = TRUE)

h <- hist(cds[cds$V1 %in% complete_seq_ids, ]$RELATIVE_POSITION, breaks = position_breaks, plot = FALSE)
#~ h<-hist(gene$RELATIVE_POSITION,breaks=seq(0,1,bin_density),plot=F)

plot(
    h$mids, h$density,
    ylim = c(0.8, 1.1), type = "l", xlab = "", ylab = "",
    lwd = 2, lty = 2, col = "darkgrey", axes = FALSE
)
axis(4)
#legend("bottom",legend=c("CDS"),col=c("darkgrey"),lwd=2,lty=2,bty="n")
dev.off()





# log2 ratio between the positional density of each group and the positional
# density of all classified IESs, on the analysed scaffolds. The largest ratio
# of every group is printed to the console.
pdf(paste0(img_dir, "/IES_density_ratio_along_scaffolds.pdf"))
ratio_breaks <- seq(0, 1, bin_density)
scaffold_ies_ids <- ies[ies$SEQ_ID %in% seq_ids, ]$ID
reference_ids <- intersect(groups[groups$GROUP_NAME != "None", ]$ID, scaffold_ies_ids)
h <- hist(ies[reference_ids, ]$RELATIVE_POSITION, breaks = ratio_breaks, plot = FALSE)
plot(
    NULL,
    xlim = c(0, 1), ylim = c(-1, 1),
    xlab = "Relative scaffold position", ylab = "IES density enrichment (log2)"
)
for (gname in group_names[group_names != "None"]) {
    group_ies_ids <- intersect(groups[groups$GROUP_NAME == gname, ]$ID, scaffold_ies_ids)
    hg <- hist(ies[group_ies_ids, ]$RELATIVE_POSITION, breaks = ratio_breaks, plot = FALSE)
    enrichment <- log2(hg$density / h$density)
    lines(h$mids, enrichment, lwd = 2, col = gcolors[gname])
    print(max(enrichment))
    #plot(hg$mids,h$density,type="l")
}
#legend("bottom",legend=group_names,col=gcolors,lwd=2,bty="n")
dev.off()



# Same figure restricted to the scaffolds listed in complete_seq_ids.
pdf(paste0(img_dir, "/IES_density_ratio_along_complete_scaffolds.pdf"))
complete_ies_ids <- ies[ies$SEQ_ID %in% complete_seq_ids, ]$ID
reference_ids <- intersect(groups[groups$GROUP_NAME != "None", ]$ID, complete_ies_ids)
h <- hist(ies[reference_ids, ]$RELATIVE_POSITION, breaks = seq(0, 1, bin_density), plot = FALSE)
plot(
    NULL,
    xlim = c(0, 1), ylim = c(-1, 1),
    xlab = "Relative scaffold position", ylab = "IES density enrichment (log2)"
)
for (gname in group_names[group_names != "None"]) {
    group_ies_ids <- intersect(groups[groups$GROUP_NAME == gname, ]$ID, complete_ies_ids)
    hg <- hist(ies[group_ies_ids, ]$RELATIVE_POSITION, breaks = seq(0, 1, bin_density), plot = FALSE)
    enrichment <- log2(hg$density / h$density)
    lines(h$mids, enrichment, lwd = 2, col = gcolors[gname])
    print(max(enrichment))
    #plot(hg$mids,h$density,type="l")
}
#legend("bottom",legend=group_names,col=gcolors,lwd=2,bty="n")
dev.off()



# Gene bias




# For each location flag (IN_GENE, IN_CDS, IN_INTRON): percentage of IESs
# carrying the flag, over all IESs and inside each group, together with the
# p-value of a chi-square test (no continuity correction) of the group counts
# against the counts over all IESs.
results <- c()
for (cname in c("IN_GENE", "IN_CDS", "IN_INTRON")) {
    n_all <- nrow(ies)
    hits_all <- sum(ies[, cname])
    res <- c(round(hits_all / n_all * 100, 1))
    # No test for the "All" row.
    pvalues <- c(NA)
    for (gname in group_names[group_names != "None"]) {
        group_ies_ids <- groups[groups$GROUP_NAME == gname, ]$ID
        ies_cur <- ies[group_ies_ids, ]
        n_cur <- nrow(ies_cur)
        hits_cur <- sum(ies_cur[, cname])
        res <- c(res, round(hits_cur / n_cur * 100, 1))

        # 2 x 2 table: rows = all IESs / group, columns = flagged / not flagged.
        m <- rbind(
            c(hits_all, n_all - hits_all),
            c(hits_cur, n_cur - hits_cur)
        )
        pvalues <- c(pvalues, chisq.test(m, correct = FALSE)$p.value)
    }
    results <- cbind(results, res, pvalues)
    last_two <- (ncol(results) - 1):ncol(results)
    colnames(results)[last_two] <- paste0(c("Percent_", "P-Value_"), cname)
}
rownames(results) <- c("All", group_names[group_names != "None"])
results

write.table(
    results,
    paste0(img_dir, "/IES_Gene_distribution.tsv"),
    sep = "\t", quote = FALSE, row.names = TRUE
)





# Three IES subsets selected on the IN_GENE, IN_CDS and PTC flags:
#   NON_CODING      - IES outside genes
#   CODING_STOPWITH - IES inside a CDS with PTC set
#   CODING_STOPLESS - IES inside a CDS with PTC not set
# NOTE(review): PTC presumably flags a premature termination codon -- confirm
# against the feature table documentation.
d=list(
NON_CODING=ies[!ies$IN_GENE,],
CODING_STOPWITH=ies[ies$IN_CDS & ies$PTC,],
CODING_STOPLESS=ies[ies$IN_CDS & !ies$PTC,]
)

# One table row per (subset, group): number of IESs, percentage of IESs for
# each value "0", "1", "2" of the N3 column (labelled 3n, 3n+1, 3n+2 below) and
# the p-value of a chi-square test of the group counts against the counts of
# the whole subset. Rows mix strings and numbers, so `results` is a character
# matrix.
results=c()
for(dname in names(d)) {
    ies_cur=d[[dname]]
    # Counts of N3 = 0, 1, 2 in the subset (NA where a value does not occur).
    n3=as.vector(table(ies_cur$N3)[c("0","1","2")])
    # "All" row of the subset: no test, hence NA as p-value.
    results=rbind(results,c(dname,"All",nrow(ies_cur),round(n3/nrow(ies_cur)*100,0),NA))
    
    for(gname in group_names[group_names!="None"]) {
        group_ies_ids= groups[groups$GROUP_NAME==gname,]$ID
        # IESs of the subset that belong to the current group.
        ies_cur_group=ies_cur[intersect(rownames(ies_cur),group_ies_ids),]
        n3_group=as.vector(table(ies_cur_group$N3)[c("0","1","2")])
        
        # 2 x 3 contingency table: whole subset vs. current group.
        m=rbind(c(n3),n3_group)
        
        # The category name is written on the "All" row only (NA here).
        results=rbind(results,c(NA,gname,nrow(ies_cur_group),round(n3_group/nrow(ies_cur_group)*100,0),chisq.test(m, correct=F)$p.value))

    }
}

colnames(results)=c("Category","Group","Number","3n","3n+1","3n+2","P-value")

# Tab-separated output, written without quotes and without row names.
write.table(results,paste0(img_dir,"/IES_3n_distribution.tsv"),sep="\t",quote=F,row.names=F)









# age

# IESs that are both dated and assigned to a group other than "None".
dated_ies_ids <- intersect(rownames(dated), groups[groups$GROUP_NAME != "None", ]$ID)

# Dated IESs of each age class (independent of the length category).
age_classes <- c("Old", "Aurelia", "Recent")
ids_by_age <- lapply(age_classes, function(age_class) {
    intersect(rownames(dated[dated$LCA_ageclass == age_class, ]), dated_ies_ids)
})

# Per length category (pairs of `sizes`): fraction of all dated IESs in the
# category, followed by the same fraction inside each age class.
results <- c()
for (s in seq(1, length(sizes), 2)) {
    in_category <- ies$SIZE >= sizes[s] & ies$SIZE < sizes[s + 1]
    cur_ies_ids <- intersect(ies[in_category, ]$ID, dated_ies_ids)

    per_age <- vapply(ids_by_age, function(age_ies_ids) {
        length(intersect(age_ies_ids, cur_ies_ids)) / length(age_ies_ids)
    }, numeric(1))
    res <- c(length(cur_ies_ids) / length(dated_ies_ids), per_age)
    results <- rbind(results, res)
    rownames(results)[nrow(results)] <- paste0(sizes[s], "-", sizes[s + 1] - 1, " nt")
}
colnames(results) <- c("All", "Old", "Aurelia", "Recent")
# The last category is labelled from its lower bound only.
rownames(results)[nrow(results)] <- paste0(">", sizes[s] - 1, " nt")

# Palette of the disabled colour figure below; replaced before plotting.
dated_colors <- c("salmon", brewer.pal(3, "Set1"))

#~ pdf(paste0(img_dir,"/IES_Ages_by_classes_of_Sizes.pdf"),width=5)
#~ par(mar=c(6.1, 4.1, 4.1, 2.1), xpd=TRUE)
#~ bp<-barplot(t(results),beside=T,las=2,main="",col=dated_colors,ylab="IES proportion",xlab="",border="white",ylim=c(0,1.15))
#~ dev.off()

# Grey scale, one shade per length category (row of `results`).
dated_colors <- rev(brewer.pal(3, "Greys"))[1:nrow(results)]

# Stacked bars: one bar per column of `results`, one segment per category.
pdf(paste0(img_dir, "/IES_Ages_by_classes_of_Sizes_NB.pdf"), width = 6)
bp <- barplot(
    results,
    main = "", col = dated_colors, ylab = "IES proportion", xlab = "",
    border = "white", ylim = c(0, 1.15)
)
legend("top", horiz = TRUE, legend = rownames(results), pch = 15, col = dated_colors, bty = "n", cex = 1.3)
dev.off()





# Group composition of all classified IESs (group not "None"), then of the
# IESs of each age class. For every age class the p-value of a chi-square test
# (no continuity correction) against the overall group counts is printed.
classified_groups <- groups[groups$GROUP_NAME != "None", ]
overall_counts <- table(groups$GROUP_NAME)[group_names]
results <- c(overall_counts / nrow(classified_groups))
# Count written above the "All" bar: classified IESs only, i.e. the same
# population as the proportions and as the counts of the other bars.
nbs <- c(nrow(classified_groups))
age_classes <- c("Old", "Aurelia", "Recent", "Leaf")
for (age in age_classes) {
    cur_ies_ids <- intersect(rownames(dated[dated$LCA_ageclass == age, ]), classified_groups$ID)
    age_counts <- table(groups[cur_ies_ids, ]$GROUP_NAME)[group_names]
    results <- rbind(results, age_counts / length(cur_ies_ids))
    nbs <- c(nbs, length(cur_ies_ids))

    m <- rbind(overall_counts, age_counts)
    # A group absent from the age class is looked up as NA; count it as 0 so
    # that chisq.test() does not reject the table.
    m[is.na(m)] <- 0
    print(paste(age, chisq.test(m, correct = FALSE)$p.value))
}
results[is.na(results)] <- 0
rownames(results) <- c("All", age_classes)
colnames(results) <- group_names


# Stacked bars, one per row of `results`, with the IES count above each bar.
pdf(paste0(img_dir, "/GroupProp_in_each_IES_Age_classes.pdf"))
par(mar = c(4.1, 4.1, 4.1, 2.1), xpd = TRUE)
bp <- barplot(t(results), las = 2, main = "", ylab = "IES proportion", xlab = "", col = c(gcolors[group_names]), border = "white", ylim = c(0, 1.15))
text(bp, 1.03, nbs)
legend("topleft", legend = colnames(results), col = c(gcolors[group_names]), pch = 15, horiz = TRUE, bty = "n")
dev.off()




# Age-class composition of the dated IESs: first for every classified IES
# (group not "None"), then inside each group.
age_classes <- c("Old", "Aurelia", "Recent", "Leaf")
cur_ies_ids <- intersect(rownames(dated), groups[groups$GROUP_NAME != "None", ]$ID)
results <- c(table(dated[cur_ies_ids, ]$LCA_ageclass)[age_classes] / length(cur_ies_ids))
nbs <- c(length(cur_ies_ids))
for (gname in c("Very early", "Early", "Intermediate", "Late")) {
    cur_ies_ids <- intersect(rownames(dated), groups[groups$GROUP_NAME == gname, ]$ID)
    results <- rbind(results, table(dated[cur_ies_ids, ]$LCA_ageclass)[age_classes] / length(cur_ies_ids))
    rownames(results)[nrow(results)] <- gname
    nbs <- c(nbs, length(cur_ies_ids))
}
# rbind() named the first row after the variable it came from ("results"),
# which ended up as the label of the first bar; name it like the other figures.
rownames(results)[1] <- "All"
# An age class that does not occur is looked up as NA, name included: restore
# the column names before they are used in the legend.
colnames(results) <- age_classes
results[is.na(results)] <- 0

# Stacked bars, one per row of `results`, with the IES count above each bar.
pdf(paste0(img_dir, "/IES_Age_classes_in_each_Groups.pdf"))
dated_colors <- brewer.pal(ncol(results), "Dark2")
par(mar = c(6.1, 4.1, 4.1, 2.1), xpd = TRUE)
bp <- barplot(t(results), las = 2, main = "", ylab = "IES proportion", xlab = "", col = dated_colors, border = "white", ylim = c(0, 1.15))
text(bp, 1.03, nbs)
legend("topleft", legend = colnames(results), col = dated_colors, pch = 15, horiz = TRUE, bty = "n")
dev.off()

      
 
# Group composition of all classified IESs (group not "None") compared with
# the composition of the highly conserved IESs.
classified_ids <- groups[groups$GROUP_NAME != "None", ]$ID
cur_ies_ids <- intersect(highly_conserved$ID, classified_ids)
results <- rbind(
    c(table(groups$GROUP_NAME)[group_names] / length(classified_ids)),
    table(groups[cur_ies_ids, ]$GROUP_NAME)[group_names] / length(cur_ies_ids)
)
nbs <- c(length(classified_ids), length(cur_ies_ids))
rownames(results) <- c("All", "HighlyConserved")

# NOTE(review): no pdf() device is opened for this figure, unlike the others;
# it is drawn on whichever graphics device is current.
par(mar = c(10.1, 4.1, 4.1, 2.1), xpd = TRUE)
bp <- barplot(
    t(results),
    las = 2, main = "", ylab = "IES proportion", xlab = "",
    col = c(gcolors[group_names]), border = "white", ylim = c(0, 1.15)
)
text(bp, 1.03, nbs)
legend("topleft", legend = colnames(results), col = c(gcolors[group_names]), pch = 15, horiz = TRUE, bty = "n")
     
      

      
      
      
# sensitivity
    
# IES ids per sensitivity class: the rows of `retention` selected by the
# corresponding SIGNIFICANT_<class> column.
ies_sensitivity_classes <- list()
for (cl in c("EZL1", "TFIIS4", "DCL23", "DCL5", "ONLY_EXC_CPLX", "PGM")) {
    is_significant <- retention[, paste0("SIGNIFICANT_", cl)]
    ies_sensitivity_classes[[cl]] <- as.vector(retention[is_significant, ]$ID)
}

# IESs in the EZL1 class that are in none of the TFIIS4, DCL23 and DCL5 classes.
other_factor_ids <- Reduce(union, list(
    ies_sensitivity_classes[["TFIIS4"]],
    ies_sensitivity_classes[["DCL23"]],
    ies_sensitivity_classes[["DCL5"]]
))
ies_sensitivity_classes[["EZL1_ONLY"]] <- setdiff(ies_sensitivity_classes[["EZL1"]], other_factor_ids)

# Venn diagram input: the four classes restricted to classified IESs
# (group not "None").
vinput <- list()
classified_ids <- groups[groups$GROUP_NAME != "None", ]$ID
for (cl in c("EZL1", "TFIIS4", "DCL23", "DCL5")) {
    vinput[[cl]] <- intersect(ies_sensitivity_classes[[cl]], classified_ids)
}

venn_prefix <- paste0(img_dir, "/venn_", paste(names(vinput), collapse = "_"))

pdf(paste0(venn_prefix, ".pdf"))
venn(vinput)
dev.off()

pdf(paste0(venn_prefix, "_simplified.pdf"))
venn(vinput, simplify = TRUE)
dev.off()





# Group composition of all classified IESs (group not "None"), then of the
# classified IESs of every sensitivity class. For each class the p-value of a
# chi-square test (no continuity correction) against the overall group counts
# is printed.
classified_ids <- groups[groups$GROUP_NAME != "None", ]$ID
overall_counts <- table(groups$GROUP_NAME)[group_names]
results <- c(overall_counts / length(classified_ids))
nbs <- c(length(classified_ids))
for (cl in names(ies_sensitivity_classes)) {
    cur_ies_ids <- intersect(ies_sensitivity_classes[[cl]], classified_ids)
    class_counts <- table(groups[cur_ies_ids, ]$GROUP_NAME)[group_names]
    results <- rbind(results, class_counts / length(cur_ies_ids))
    nbs <- c(nbs, length(cur_ies_ids))

    m <- rbind(overall_counts, class_counts)
    # Groups absent from the class are looked up as NA; count them as 0.
    m[is.na(m)] <- 0
    print(paste(cl, chisq.test(m, correct = FALSE)$p.value))
}
results[is.na(results)] <- 0
rownames(results) <- c("All", names(ies_sensitivity_classes))


# Stacked bars, one per row of `results`, with the IES count above each bar.
pdf(paste0(img_dir, "/GroupProp_in_each_IESsig_classes.pdf"))
par(mar = c(8.1, 4.1, 4.1, 2.1), xpd = TRUE)
bp <- barplot(
    t(results),
    las = 2, main = "", ylab = "IES proportion", xlab = "",
    col = c(gcolors[group_names]), border = "white", ylim = c(0, 1.15)
)
text(bp, 1.03, nbs)
legend("topleft", legend = colnames(results), col = c(gcolors[group_names]), pch = 15, horiz = TRUE, bty = "n")
dev.off()



# Append one row per age class to the table built for the sensitivity classes
# (`results` and `nbs` are extended in place).
dated_classes <- c("Old", "Aurelia", "Recent")
classified_ids <- groups[groups$GROUP_NAME != "None", ]$ID
for (age in dated_classes) {
    cur_ies_ids <- intersect(rownames(dated[dated$LCA_ageclass == age, ]), classified_ids)
    age_share <- table(groups[cur_ies_ids, ]$GROUP_NAME)[group_names] / length(cur_ies_ids)
    results <- rbind(results, age_share)
    nbs <- c(nbs, length(cur_ies_ids))
}
results[is.na(results)] <- 0
# Name the three rows that were just appended.
appended_rows <- seq(nrow(results) - 2, nrow(results))
rownames(results)[appended_rows] <- dated_classes


# Stacked bars with the IES count above each bar.
pdf(paste0(img_dir, "/GroupProp_in_each_IESsig_classes_with_Dated.pdf"))
par(mar = c(8.1, 4.1, 4.1, 2.1), xpd = TRUE)
bp <- barplot(
    t(results),
    las = 2, main = "", ylab = "IES proportion", xlab = "",
    col = c(gcolors[group_names]), border = "white", ylim = c(0, 1.15)
)
text(bp, 1.03, nbs)
legend("topleft", legend = colnames(results), col = c(gcolors[group_names]), pch = 15, horiz = TRUE, bty = "n")
dev.off()



# Same table (age-class rows included) drawn with side-by-side bars.
pdf(paste0(img_dir, "/GroupProp_in_each_IESsig_classes_beside.pdf"))
par(mar = c(8.1, 4.1, 4.1, 2.1), xpd = TRUE)
bp <- barplot(
    t(results),
    beside = TRUE, las = 2, main = "", ylab = "IES proportion", xlab = "",
    col = c(gcolors[group_names]), ylim = c(0, 0.6)
)
legend("topleft", legend = colnames(results), col = c(gcolors[group_names]), pch = 15, bty = "n")
dev.off()
   
   


# One panel per group (2 x 2 grid): fraction of the IESs of the group that
# belong to each sensitivity class.
pdf(paste0(img_dir, "/IESsig_in_each_Group_classes_beside.pdf"), width = 8, height = 8)
par(mfrow = c(2, 2))
panel_classes <- c("EZL1", "TFIIS4", "DCL23", "DCL5", "ONLY_EXC_CPLX")
for (gname in group_names) {
    group_ids <- groups[groups$GROUP_NAME == gname, ]$ID
    # vapply() over a character vector names the result after the classes.
    res <- vapply(panel_classes, function(class_name) {
        length(intersect(ies_sensitivity_classes[[class_name]], group_ids)) / length(group_ids)
    }, numeric(1))
    barplot(
        res,
        col = gcolors[gname], ylim = c(0, 1),
        ylab = "IES proportion", main = paste(gname, "IESs")
    )
}
dev.off()
 



# Group rank divided by the number of classified IESs (group not "None").
n_classified <- nrow(groups[groups$GROUP_NAME != "None", ])
groups$GROUP_RANK_IDX <- groups$GROUP_RANK / n_classified

# Histogram of the rank index for every sensitivity class except "PGM"
# (3 x 2 grid on the current graphics device). In each panel, a dashed line
# marks the largest rank index of every group and the group name is written
# at the mean rank index of the group, just above the tallest bar.
par(mfrow = c(3, 2))
#for(cl in c("EZL1","TFIIS4","ONLY_EXC_CPLX")) {
for (cl in setdiff(names(ies_sensitivity_classes), "PGM")) {
    cur_ies_ids <- intersect(ies_sensitivity_classes[[cl]], groups[groups$GROUP_NAME != "None", ]$ID)
    h <- hist(
        groups[cur_ies_ids, ]$GROUP_RANK_IDX,
        breaks = seq(0, 1, 0.01), main = cl, xlab = "Group rank index"
    )
    label_height <- max(h$counts) + 2

    for (gname in group_names) {
        group_rank_idx <- groups[groups$GROUP_NAME == gname, ]$GROUP_RANK_IDX
        abline(v = max(group_rank_idx, na.rm = TRUE), lty = 2)
        text(mean(group_rank_idx, na.rm = TRUE), label_height, gname, col = gcolors[gname])
    }
}



    
# One PDF per sensitivity class (2 x 3 grid): length histogram of all IESs of
# the class, one histogram per group (an empty panel when the group has no
# IES in the class), and a last panel with the cumulative length
# distributions. The fraction of IESs shorter than 150 nt is printed for
# every (class, group) pair.
max_size <- 200
for (cl in names(ies_sensitivity_classes)) {
    class_ids <- ies_sensitivity_classes[[cl]]
    class_title <- paste0("IES ", cl, "-sensitive N=", length(class_ids))
    nt_breaks <- seq(0, max(ies$SIZE) + 1, 1)

    pdf(paste0(img_dir, "/IES_sizes_for_", cl, "-sensitive_IES.pdf"), width = 14, height = 10)

    par(mfrow = c(2, 3))
    hist(
        ies[class_ids, ]$SIZE,
        breaks = nt_breaks, xlim = c(25, max_size), main = class_title,
        xlab = "IES length (nt)", col = "black", border = "white"
    )

    for (gname in group_names) {
        cur_ies_ids <- intersect(class_ids, groups[groups$GROUP_NAME == gname, ]$ID)
        if (length(cur_ies_ids) == 0) {
            plot.new()
        } else {
            hist(
                ies[cur_ies_ids, ]$SIZE,
                breaks = nt_breaks, xlim = c(25, max_size),
                main = paste0(gname, " IES ", cl, "-sensitive N=", length(cur_ies_ids)),
                xlab = "IES length (nt)", col = gcolors[gname], border = "white"
            )
        }
    }

    plot(NULL, xlim = c(25, max_size), ylim = c(0, 1), main = class_title)
    lines(ecdf(ies[class_ids, ]$SIZE), col = "black")
    for (gname in group_names) {
        cur_ies_ids <- intersect(class_ids, groups[groups$GROUP_NAME == gname, ]$ID)
        if (length(cur_ies_ids) > 0) {
            lines(ecdf(ies[cur_ies_ids, ]$SIZE), col = gcolors[gname])
        }
        short_fraction <- sum(ies[cur_ies_ids, ]$SIZE < 150) / length(cur_ies_ids)
        print(paste(cl, gname, short_fraction))
    }
    legend(
        "right",
        title = paste0("IES ", cl, "-sensitive"), legend = c("All", group_names),
        lwd = 2, col = c("black", gcolors[group_names]), bty = "n"
    )
    dev.off()

}
    
    

    
