
source("../../headers.R")
source("../../functions.R")



# Output locations (relative to the working directory) for figures and
# result tables; both are created up front if they do not exist yet.
img_dir <- "images/"
base_result_dir <- "results/"

for (out_dir in c(img_dir, base_result_dir)) {
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
}



# IES group assignment table (tab-separated, with a header line). The
# ID and GROUP_NAME columns are used further down to select IES.
groups <- read.table(
  "../../IES_Excision_Score/IES_Groups.tsv",
  header = TRUE,
  sep = "\t"
)
# Display names of the groups.
group_names <- c("Very early", "Early", "Intermediate", "Late")


# Reference genome FASTA and the short prefix used in output file names.
genome <- "/data/PARAMECIUM/REFERENCES/tetraurelia/ptetraurelia_mac_51_with_ies.fa"
# Strip only a trailing ".fa" / ".fasta" extension. The dot is escaped and
# the pattern is anchored so that an "fa" elsewhere in the file name is
# never removed by mistake.
basegenome <- sub("\\.(fa|fasta)$", "", basename(genome))
ref_pref <- "pt_51_with_ies"


#~ event_name_category="nested_IES"
#~ event_prefix="NestIES.PTET51.1"

# `colors`, `prefixes` and `labels` are not defined in this file;
# presumably they come from the sourced headers.R -- TODO confirm.
label_colors= colors
# Bare top-level expression: shows the prefix/label pairing when the
# script is run with auto-printing; the result is not stored.
data.frame(PREFIX=prefixes,LABEL=labels)

# Sequence lengths (two columns, no header): sequence id and length.
seqlength <- read.table("../../data/ptetraurelia_mac_51.seqlength", header = FALSE)
colnames(seqlength) <- c("SEQ_ID", "SEQ_LENGTH")
# Longest sequences first.
seqlength <- seqlength[order(seqlength$SEQ_LENGTH, decreasing = TRUE), ]
# Only sequences with SEQ_LENGTH above 30000 are analysed.
seq_ids <- seqlength$SEQ_ID[seqlength$SEQ_LENGTH > 30000]

# Reference IES feature table, one row per IES.
ies <- read.table(
  "/data/PARAMECIUM/GENOMIC/tetraurelia/micronucleus/IES/51/internal_eliminated_sequence_PGM_IES51_features.tab",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
# Left-join the START/END columns of the ParTIES Insert table; they are
# stored as START_MACIES / END_MACIES and are NA for IES absent from
# that table.
ies <- merge(
  ies,
  read.table(
    "/data/PARAMECIUM/ANALYSIS/tetraurelia/ptetraurelia_mac_51//PGM/SAMPLES/ABK_COSP/ParTIES_DeNovo/Insert/Insert.tab",
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE
  )[, c("ID", "START", "END")],
  by = "ID",
  all.x = TRUE,
  suffixes = c("", "_MACIES")
)
rownames(ies) <- ies$ID

# Keep IES located on the selected sequences ...
ies <- ies[ies$SEQ_ID %in% seq_ids, ]
# ... and assigned to a group other than "None".
ies <- ies[ies$ID %in% groups$ID[groups$GROUP_NAME != "None"], ]




# For every sample prefix, load its ParTIES Compare table and keep only the
# candidate events: non-identical, non-overlapping excision errors that are
# internal to (or share one boundary with) a retained reference IES, are
# bounded by TA and are at least 20 long.
#   data      : named list, one filtered table per prefix that has a file
#   event_ids : unique IDs of all retained events across samples
data <- list()
event_ids <- c()
for (prefix in prefixes) {
  # Locate the "<prefix>.rmdup" sample directory under the analysis tree.
  sample_dir <- try(system(paste0("find /data/PARAMECIUM/ANALYSIS/tetraurelia/ptetraurelia_mac_51/ -name ", prefix, ".rmdup | head -1"), intern = TRUE))
  compare_file <- paste0(sample_dir, "/ParTIES/Compare/Compare.current.tab")
  print(compare_file)
  if (!file.exists(compare_file)) {
    # Samples without a Compare table are reported and skipped.
    print(paste("No file", compare_file))
  } else {
    tab <- read.delim(compare_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

    # All row filters are combined and applied through which(), so a missing
    # value in any criterion drops the row instead of producing an all-NA
    # row. The %in% test on BOUNDARY_RELATION also excludes "identical" and
    # NA relations.
    keep <- !tab$IDENTICAL &
      !tab$IS_OVERLAPPING &
      tab$ERROR_TYPE == "excision error" &
      tab$BOUNDARY_RELATION %in% c("internal", "left_internal", "right_internal") &
      tab$BOUNDED_BY_TA &
      tab$CLOSEST_FEATURE_ID %in% ies$ID &
      tab$SIZE >= 20
    tab <- tab[which(keep), ]

    # A missing reference support is counted as zero.
    tab$SUPPORT_REF[is.na(tab$SUPPORT_REF)] <- 0

    # Attach the coordinates of the enclosing IES (NA when unknown).
    tab <- merge(tab, ies[, c("ID", "START_MACIES", "END_MACIES")], by.x = "CLOSEST_FEATURE_ID", by.y = "ID", all.x = TRUE)

    # Both event boundaries must lie within [START_MACIES, END_MACIES + 2].
    # Events whose IES has no such coordinates are dropped (NA comparison).
    inside_ies <- tab$START >= tab$START_MACIES &
      tab$START <= (tab$END_MACIES + 2) &
      tab$END >= tab$START_MACIES &
      tab$END <= (tab$END_MACIES + 2)
    tab <- tab[which(inside_ies), ]

    event_ids <- c(event_ids, tab$ID)

    data[[prefix]] <- tab
  }
}
event_ids <- unique(event_ids)




# list all events
# One row per retained event ID; the descriptive columns are filled from the
# first sample (in `prefixes` order) in which the event is found.
if (length(event_ids) == 0) {
  stop("No candidate event found in any Compare table", call. = FALSE)
}
events <- data.frame(ID = event_ids, NAME = NA, SEQ_ID = NA, START = NA, END = NA, BOUNDARY_RELATION = NA, IES_ID = NA, BOUNDED_BY_TA = NA, SEQ = NA, stringsAsFactors = FALSE)
rownames(events) <- events$ID

# events column -> column of the per-sample table it is copied from
# (SEQ holds the full SEQUENCE string, not a truncated prefix)
copied_cols <- c(
  SEQ_ID = "SEQ_ID",
  START = "START",
  END = "END",
  BOUNDARY_RELATION = "BOUNDARY_RELATION",
  IES_ID = "CLOSEST_FEATURE_ID",
  BOUNDED_BY_TA = "BOUNDED_BY_TA",
  SEQ = "SEQUENCE"
)
for (prefix in prefixes) {
  tab <- data[[prefix]]
  # Samples without a Compare table have no entry in `data`: skip them.
  if (is.null(tab) || nrow(tab) == 0) {
    next
  }
  # Events not described yet that are present in this sample.
  pending <- which(is.na(events$SEQ_ID))
  hit <- match(events$ID[pending], tab$ID)
  found <- !is.na(hit)
  if (!any(found)) {
    next
  }
  for (col in names(copied_cols)) {
    events[[col]][pending[found]] <- tab[[copied_cols[[col]]]][hit[found]]
  }
}

# Fully internal events are "Nested"; left_/right_internal are "Juxtaposed".
events$TYPE <- ifelse(events$BOUNDARY_RELATION == "internal", "Nested", "Juxtaposed")

# Public name: the "MILORD" part of the ID is replaced by a type prefix.
events$NAME <- ifelse(
  events$TYPE == "Nested",
  sub("MILORD", "NestIES.PTET51.1", events$ID),
  sub("MILORD", "JuxtIES.PTET51.1", events$ID)
)

events$LENGTH <- events$END - events$START + 1
# Size of the enclosing reference IES, looked up by exact ID.
events$IES_LENGTH <- ies$SIZE[match(events$IES_ID, ies$ID)]
events$DIFF_LENGTH <- events$IES_LENGTH - events$LENGTH

summary(events)

# FILTER ON SUPPORT REF in FRAGMENTS :
# Build an events x samples matrix of SUPPORT_REF (rows in `events` order,
# one column per label) and keep the events with no reference support in
# any of the `frag_labels` samples.
results <- data.frame(row.names = rownames(events))
for (f in seq_along(prefixes)) {
  tab <- data[[prefixes[f]]]
  # A sample without a Compare table, or an event absent from the sample,
  # counts as zero support.
  support <- rep(0, nrow(events))
  if (!is.null(tab)) {
    support <- tab$SUPPORT_REF[match(rownames(events), tab$ID)]
    support[is.na(support)] <- 0
  }
  results[[labels[f]]] <- support
}

# drop = FALSE keeps a data frame even with a single label/column.
no_frag_support <- apply(results[, frag_labels, drop = FALSE], 1, max, na.rm = TRUE) == 0
results <- results[no_frag_support, , drop = FALSE]
events <- events[rownames(results), ]

# Spot check: remaining events for one specific IES.
results[intersect(rownames(results),rownames(events[events$IES_ID=="IESPGM.PTET51.1.106.288995",])),]



# FILTER ON SUPPORT VARIANT :
# Rebuild `results` as an events x samples matrix of SUPPORT_VARIANT (rows
# in `events` order, one column per label).
results <- data.frame(row.names = rownames(events))
for (f in seq_along(prefixes)) {
  tab <- data[[prefixes[f]]]
  # A sample without a Compare table, or an event absent from the sample,
  # counts as zero support.
  support <- rep(0, nrow(events))
  if (!is.null(tab)) {
    support <- tab$SUPPORT_VARIANT[match(rownames(events), tab$ID)]
    support[is.na(support)] <- 0
  }
  results[[labels[f]]] <- support
}

# absent from fragments
# (drop = FALSE keeps a data frame even with a single label/column)
no_frag_support <- apply(results[, frag_labels, drop = FALSE], 1, max, na.rm = TRUE) == 0
results <- results[no_frag_support, , drop = FALSE]

# present with at least 10 reads in at least two samples
well_supported <- rowSums(results[, alg_labels, drop = FALSE] >= 10) >= 2
results <- results[well_supported, , drop = FALSE]


events <- events[rownames(results), ]

summary(events)


# eliminate overlapping events
# Events overlapping no other event are kept as they are. Within each
# cluster of overlapping events only the best one is kept: TA-bounded
# first, then the highest summed variant support over `alg_labels`.
gr <- makeGRangesFromDataFrame(events[, c("SEQ_ID", "START", "END")], seqnames.field = "SEQ_ID")
values(gr) <- DataFrame(
  NAME = events$ID,
  BOUNDED_BY_TA = events$BOUNDED_BY_TA,
  FRAG_COUNT = apply(results[, frag_labels, drop = FALSE], 1, sum, na.rm = TRUE),
  COUNT = apply(results[, alg_labels, drop = FALSE], 1, sum, na.rm = TRUE)
)

# Number of events each event overlaps, itself included (computed once).
self_hits <- findOverlaps(gr, gr)
n_hits <- tabulate(queryHits(self_hits), nbins = length(gr))
event_names <- as.character(mcols(gr)$NAME)

no_overlapping_events_ids <- event_names[n_hits == 1]
overlapping_events_ids <- event_names[n_hits > 1]

best_overlapping_events_ids <- c()
# Guarded: without any overlapping event there is nothing to reduce, and
# a 1:0 loop would index a non-existent first cluster.
if (length(overlapping_events_ids) > 0) {
  gr.red <- reduce(makeGRangesFromDataFrame(events[overlapping_events_ids, c("SEQ_ID", "START", "END")], seqnames.field = "SEQ_ID"), ignore.strand = TRUE)
  for (i in seq_len(length(gr.red))) {
    over_ids <- subjectHits(findOverlaps(gr.red[i], gr))
    if (length(over_ids) > 0) {
      candidates <- mcols(gr)[over_ids, , drop = FALSE]
      ranking <- order(!candidates$BOUNDED_BY_TA, -candidates$COUNT)
      best_name <- as.character(candidates$NAME)[ranking[1]]
      best_overlapping_events_ids <- c(best_overlapping_events_ids, best_name)
    }
  }
}

events <- events[unique(c(no_overlapping_events_ids, best_overlapping_events_ids)), ]
results <- results[rownames(events), , drop = FALSE]
gr <- makeGRangesFromDataFrame(events[, c("SEQ_ID", "START", "END")], seqnames.field = "SEQ_ID")
# Sanity check: indices of events that still overlap another one
# (expected to be empty).
as.numeric(which(tabulate(queryHits(findOverlaps(gr, gr)), nbins = length(gr)) > 1))


summary(events)

table(events$TYPE)



# Export the final event set: a 9-column GFF3-style table sorted by
# sequence and start position, plus the full `events` table as TSV.
gff <- data.frame(
  SEQ_ID = events$SEQ_ID,
  SOURCE = "MILORD",
  TYPE = "internal_eliminated_sequence",
  START = events$START,
  END = events$END,
  SCORE = ".",
  STRAND = ".",
  PHASE = ".",
  ATTRIBUTES = paste0(
    "ID=", events$NAME,
    ";Alias=", events$ID,
    ";ies_id=", events$IES_ID,
    ";boundary_relation=", events$BOUNDARY_RELATION
  )
)
gff <- gff[order(gff$SEQ_ID, gff$START), ]

# Event table, with a header line.
write.table(
  events,
  paste0("internal_IES.", ref_pref, ".tsv"),
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE,
  quote = FALSE
)

# GFF3 records, without a header line.
write.table(
  gff,
  paste0("internal_IES.", ref_pref, ".gff3"),
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)



