
  
source("../headers.R")
source("../functions.R")

library(seqinr)
library(stringr)
library(binom)
library(ggseqlogo)
# Output locations for figures and tables, relative to the working directory.
base_img_dir="images/"
base_result_dir="results/"

# Create both directories (and any missing parents); an already existing
# directory is silently accepted.
dir.create(base_img_dir,showWarnings=FALSE,recursive=TRUE)
dir.create(base_result_dir,showWarnings=FALSE,recursive=TRUE)



# Per-IES group assignment table (tab-separated, with a header line).
# NOTE(review): later code reads a GROUP_NAME column from this table and
# indexes it by IES ID through its row names - confirm the file layout.
groups=read.table("../IES_Excision_Score/IES_Groups.tsv",header=TRUE,sep="\t")
# Group labels in display order. A fifth group ("None", drawn in "grey") is
# deliberately left out here.
group_names=c("Very early","Early","Intermediate","Late")

# One colour per group, addressable by group label.
gcolors=setNames(c("#E41A1C","#FF7F00","#4DAF4A","dodgerblue"),group_names)



# Assign an IES to its size bin ("peak").
#
# x: one row of ies[,c("ID","SIZE")] as handed over by apply(); only the
#    second element (the size) is read and converted to a number.
#
# Returns a label "<min>-<max>_nt" with both bounds inclusive, or NA when the
# size is missing or smaller than the first breakpoint (25).
#
# Reads the global data frame `ies` to find the largest IES size.
peak_name<-function(x) {
    # Breakpoints delimiting half-open bins [lower, upper). The final
    # breakpoint is one past the largest IES so that the longest IES lands in
    # the last bin instead of being left without a peak.
    peaks = c(25,34, 42, 52, 63, 72, 82, 91, 102,111,122,132,141,max(ies$SIZE)+1)
    lower = peaks[-length(peaks)]
    upper = peaks[-1]

    size=as.numeric(x[2])

    # which() ignores NA comparisons, so a missing size yields NA below
    # rather than an error.
    hit = which(size >= lower & size < upper)
    if(length(hit) == 0) {
        return(NA)
    }
    hit = hit[length(hit)]
    paste0(lower[hit],"-",upper[hit]-1,"_nt")
}
# Load the IES feature table before anything touches `ies`: peak_name() reads
# ies$SIZE, and the PEAK column has to be added to the table that the rest of
# the script uses (adding it before this read would be overwritten by it).
ies=read.table("../data/internal_eliminated_sequence_PGM_IES51_features.tab",header=TRUE,sep="\t",stringsAsFactors=FALSE)
rownames(ies)=ies$ID

# Label every IES with its size bin; apply() passes each (ID, SIZE) row to
# peak_name() as a character vector.
ies$PEAK=apply(ies[,c("ID","SIZE")],1,peak_name)

# Reverse-complement a sequence.
#
# x: vector whose second element is the sequence string (the shape apply()
#    passes for a two-column row).
#
# Returns a single string: the complement, as produced by seqinr's comp(),
# of the sequence read backwards.
revCompSeq<-function(x) {
    bases=s2c(x[2])
    complemented=comp(bases)
    paste(rev(complemented),collapse="")
}

# One row per IES with its group, size, size bin and boundary sequences.
# NOTE(review): groups[ies$ID,] looks rows up by row name - confirm that the
# row names of `groups` are IES IDs, otherwise GROUP_NAME is all NA.
boundaries=data.frame(
    ID=ies$ID,
    GROUP_NAME=groups[ies$ID,]$GROUP_NAME,
    LENGTH=ies$SIZE,
    PEAK=ies$PEAK,
    # First TA in the flanking sequence that has 5 word characters on each side.
    MAC_FLANK_SEQ=str_extract(ies$MAC_FLANK_SEQ,"\\w{5}TA\\w{5}"),
    # First 13 characters of the IES (substr() treats a start of 0 as 1).
    LEFT=substr(ies$IES_SEQ,0,13),
    # 13-character right boundary from get_right_boundary_seq() (defined
    # outside this file section), called once per IES.
    RIGHT=apply(ies[,c("ID","IES_SEQ")],1,get_right_boundary_seq,add_TA=FALSE,boundary_length=13)
)
rownames(boundaries)=boundaries$ID

# Hits of the IES set against itself (file name suggests tabular BLASTN
# output; 12 columns without a header line).
conserved=read.table("internal_eliminated_sequence_PGM_IES51.VS.itself.blastn",sep="\t",header=FALSE)
colnames(conserved)=c("QNAME","TNAME","PID","ALN_LENGTH","MISMATCHES","GAPS","QSTART","QEND","TSTART","TEND","EVALUE","SCORE")

# Annotate each hit with the features of its query IES, then of its target
# IES. The second merge adds the same column names again; the clash is
# resolved by the suffixes, so query columns end in _QUERY and target columns
# in _TARGET.
conserved=merge(conserved,boundaries[,c("ID","GROUP_NAME","LENGTH","PEAK","LEFT","RIGHT")],by.x="QNAME",by.y="ID",all.x=TRUE)
conserved=merge(conserved,boundaries[,c("ID","GROUP_NAME","LENGTH","PEAK","LEFT","RIGHT")],by.x="TNAME",by.y="ID",all.x=TRUE,suffixes=c("_QUERY","_TARGET"))

# Length ratio shorter/longer, always in (0, 1].
conserved$R_LENGTH = pmin(conserved$LENGTH_QUERY,conserved$LENGTH_TARGET) / pmax(conserved$LENGTH_QUERY,conserved$LENGTH_TARGET)

# Boundaries are conserved when both ends are identical, either in the same
# orientation or with left and right swapped.
conserved$CONSERVED_BOUNDARIES= (conserved$LEFT_QUERY==conserved$LEFT_TARGET & conserved$RIGHT_QUERY==conserved$RIGHT_TARGET ) | (conserved$LEFT_QUERY==conserved$RIGHT_TARGET & conserved$RIGHT_QUERY==conserved$LEFT_TARGET )

# Keep pairs from the same size bin with near-identical lengths. which()
# drops comparisons that evaluate to NA (missing peak or length); a bare
# logical index would turn each of them into an all-NA row and inflate the
# nrow() denominator used below.
conserved = conserved[which(conserved$PEAK_QUERY==conserved$PEAK_TARGET & conserved$R_LENGTH >=0.95),]

# Percentage of retained pairs with conserved boundaries, by query group
# (rows) and target group (columns).
round(table(conserved[conserved$CONSERVED_BOUNDARIES,c("GROUP_NAME_QUERY","GROUP_NAME_TARGET")])[group_names,group_names]/nrow(conserved)*100,1)



#########################

# Hits of the IES set against the MAC genome (file name suggests tabular
# BLASTN output; 12 columns without a header line).
ies_on_mac=read.table("internal_eliminated_sequence_PGM_IES51.VS.ptetraurelia_mac_51.blastn",header=FALSE,sep="\t")
colnames(ies_on_mac)=c("QNAME","TNAME","PID","ALN_LENGTH","MISMATCHES","GAPS","QSTART","QEND","TSTART","TEND","EVALUE","SCORE")

# Attach the features of the query IES to every hit; hits whose query is
# missing from `boundaries` are kept with NA features.
ies_on_mac=merge(ies_on_mac,boundaries[,c("ID","GROUP_NAME","LENGTH","PEAK","LEFT","RIGHT")],by.x="QNAME",by.y="ID",all.x=TRUE)

# Ratio between alignment length and IES length, smaller over larger.
ies_on_mac$R_LENGTH = pmin(ies_on_mac$ALN_LENGTH,ies_on_mac$LENGTH) / pmax(ies_on_mac$ALN_LENGTH,ies_on_mac$LENGTH)
