

source("../../headers.R")
source("../../functions.R")



# Output locations (relative to the working directory); both are created up
# front, silently if they already exist.
img_dir <- "images/"
base_result_dir <- "results/"

for (out_dir in c(img_dir, base_result_dir)) {
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
}


# Reference genome identifier and the short tag used in output file names.
basegenome <- "ptetraurelia_mic2"
ref_pref <- "mic2"

# Two-column, header-less table: sequence ID and sequence length.
seqlength <- read.table(
  paste0("../../data/", basegenome, ".seqlength"),
  header = FALSE,
  stringsAsFactors = FALSE
)
colnames(seqlength) <- c("SEQ_ID", "SEQ_LENGTH")
# Key the rows by sequence ID. The column is called SEQ_ID after the rename
# above; `seqlength$ID` matched nothing and silently reset the row names.
rownames(seqlength) <- seqlength$SEQ_ID

# Only sequences longer than 1000 are kept for the rest of the analysis.
seq_ids <- seqlength$SEQ_ID[seqlength$SEQ_LENGTH > 1000]




# Per-window compartment annotation. Windows labelled "NONE" are discarded,
# the sequence column is renamed to `chr`, only windows on retained sequences
# (seq_ids) are kept, and rows are sorted by sequence then start position.
compartments <- read.table(
  "/data/PARAMECIUM/COVERAGE/tetraurelia/ptetraurelia_mic2/WINDOWS/ptetraurelia_mic2.windows1000_MIC_MAC_compartments.tab",
  header = TRUE,
  stringsAsFactors = FALSE
)
compartments <- compartments[compartments$COMPARTMENT != "NONE", ]
names(compartments)[names(compartments) == "SEQ_ID"] <- "chr"
compartments <- compartments[compartments$chr %in% seq_ids, ]
row_order <- order(compartments$chr, compartments$START)
compartments <- compartments[row_order, ]


# Load one MILORD table per sample prefix (`prefixes` is defined outside this
# section) and keep only the events passing every filter below. Samples whose
# file is missing are reported and skipped, so `data` may lack some prefixes.
res_dir <- "results/"
data <- list()
for (prefix in prefixes) {
  file <- paste0(res_dir, "/", prefix, "/ParTIES/MILORD/MILORD.tab")
  print(file)
  if (!file.exists(file)) {
    print(paste("No file", file))
    next
  }

  tab <- read.delim(file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

  # Filters are applied one after the other, in the original order.
  tab <- tab[!tab$IS_OVERLAPPING, ]
  tab <- tab[tab$DELETION_TYPE == "INTRA_CHR", ]
  tab <- tab[tab$IS_COHERENT, ]
  tab <- tab[tab$BOUNDED_BY_TA, ]
  tab <- tab[tab$SIZE >= 20, ]
  tab <- tab[tab$SEQ_ID %in% seq_ids, ]

  # A missing SUPPORT_REF value is counted as zero.
  if (anyNA(tab$SUPPORT_REF)) {
    tab$SUPPORT_REF[is.na(tab$SUPPORT_REF)] <- 0
  }

  data[[prefix]] <- tab
}

# Union of the event IDs seen in any loaded sample, in order of first
# appearance.
event_ids <- unique(unlist(lapply(data, function(sample_tab) sample_tab$ID),
                           use.names = FALSE))



# List all events: one row per unique event ID. Coordinates and sequence are
# copied from the first sample (in `prefixes` order) that reports the event.
# stringsAsFactors = FALSE keeps ID as character so that later row lookups by
# ID never fall back to factor integer codes on older R versions.
events <- data.frame(
  ID = event_ids, SEQ_ID = NA, START = NA, END = NA, SEQ = NA,
  stringsAsFactors = FALSE
)
rownames(events) <- events$ID

for (f in seq_along(prefixes)) {
  prefix <- prefixes[f]
  tab <- data[[prefix]]
  # Samples whose MILORD file was missing were skipped at load time and have
  # no table: there is nothing to copy from them.
  if (is.null(tab)) {
    next
  }
  # Events that still lack coordinates and that this sample reports.
  ids <- intersect(events$ID[is.na(events$SEQ_ID)], tab$ID)
  if (length(ids) == 0) {
    next
  }
  to <- match(ids, events$ID)
  from <- match(ids, tab$ID)
  events$SEQ_ID[to] <- tab$SEQ_ID[from]
  events$START[to] <- tab$START[from]
  events$END[to] <- tab$END[from]
  events$SEQ[to] <- tab$SEQUENCE[from]
}

# Inclusive length of each event.
events$LENGTH <- events$END - events$START + 1

events$TYPE <- "On MIC-limited"

# Public name: the "MILORD" part of the ID is replaced by the naming prefix.
events$NAME <- sub("MILORD", "LierIES.PTET51.1", events$ID)


summary(events)


# Build an events x samples table for one MILORD support column.
#   event_ids : character vector of event IDs (one output row each, same order)
#   column    : name of the column to collect from every sample table
# Reads the script-level `prefixes`, `labels` and `data`. Output columns are
# named after `labels`; events absent from a sample get 0.
milord_support_table <- function(event_ids, column) {
  out <- data.frame(ID = event_ids, stringsAsFactors = FALSE)
  for (f in seq_along(prefixes)) {
    tab <- data[[prefixes[f]]]
    if (is.null(tab)) {
      # Without this check the merge below fails with an unrelated message.
      stop("No MILORD table loaded for sample '", prefixes[f], "'",
           call. = FALSE)
    }
    out <- merge(out, tab[, c("ID", column)], by = "ID", all.x = TRUE)
    colnames(out)[ncol(out)] <- labels[f]
  }
  rownames(out) <- out$ID
  # drop = FALSE keeps a data frame even when there is a single sample.
  out <- out[, -1, drop = FALSE]
  out[is.na(out)] <- 0
  # merge() sorts by ID; restore the order of the requested events.
  out[event_ids, , drop = FALSE]
}


# FILTER ON SUPPORT REF in FRAGMENTS :
# keep events with zero SUPPORT_REF in every column listed in frag_labels.
results <- milord_support_table(rownames(events), "SUPPORT_REF")
no_ref_in_frag <- apply(results[, frag_labels, drop = FALSE], 1, max,
                        na.rm = TRUE) == 0
results <- results[no_ref_in_frag, , drop = FALSE]
events <- events[rownames(results), ]


# FILTER ON SUPPORT VARIANT :
results <- milord_support_table(rownames(events), "SUPPORT_VARIANT")

# absent from fragments
no_variant_in_frag <- apply(results[, frag_labels, drop = FALSE], 1, max,
                            na.rm = TRUE) == 0
results <- results[no_variant_in_frag, , drop = FALSE]

# present with at least 10 reads in at least two samples (alg_labels columns)
well_supported <- rowSums(results[, alg_labels, drop = FALSE] >= 10) >= 2
results <- results[well_supported, , drop = FALSE]


events <- events[rownames(results), ]

summary(events)





# Eliminate overlapping events.
# Ranges of the windows annotated "MIC-limited", declared on all retained
# sequences so that they share seqlevels with the event ranges.
miclimited <- makeGRangesFromDataFrame(
  compartments[compartments$COMPARTMENT == "MIC-limited",
               c("chr", "START", "END")]
)
seqlevels(miclimited) <- seq_ids

gr <- makeGRangesFromDataFrame(events[, c("SEQ_ID", "START", "END")],
                               seqnames.field = "SEQ_ID")
seqlevels(gr) <- seq_ids

# Per-event metadata: its ID and its summed support over the alg_labels
# columns of `results` (rows of `results` and `events` are aligned here).
# drop = FALSE keeps rowSums working when alg_labels has a single entry.
values(gr) <- DataFrame(
  NAME = as.character(events$ID),
  COUNT = rowSums(results[, alg_labels, drop = FALSE])
)

# only events on MIC-limited regions
gr <- gr[unique(queryHits(findOverlaps(gr, miclimited)))]


# remove redundancy
# Number of events each event overlaps, itself included; computed once.
n_self_hits <- tabulate(queryHits(findOverlaps(gr, gr)), nbins = length(gr))
no_overlapping_events_ids <- mcols(gr)$NAME[n_self_hits == 1]
overlapping_events_ids <- mcols(gr)$NAME[n_self_hits > 1]

# For every cluster of mutually overlapping events keep the one with the
# highest COUNT (first one on ties). Skipped entirely when nothing overlaps;
# the former 1:nrow(...) loop ran on an empty set and failed.
best_overlapping_events_ids <- character(0)
if (length(overlapping_events_ids) > 0) {
  gr.red <- reduce(makeGRangesFromDataFrame(
    events[overlapping_events_ids, c("SEQ_ID", "START", "END")],
    seqnames.field = "SEQ_ID"
  ))
  for (i in seq_along(gr.red)) {
    over_ids <- subjectHits(findOverlaps(gr.red[i], gr))
    if (length(over_ids) > 0) {
      best <- over_ids[order(mcols(gr)$COUNT[over_ids], decreasing = TRUE)[1]]
      best_overlapping_events_ids <- c(best_overlapping_events_ids,
                                       mcols(gr)$NAME[best])
    }
  }
}

events <- events[unique(c(no_overlapping_events_ids,
                          best_overlapping_events_ids)), ]

# Sanity check: indices of events that still overlap another one
# (expected to be empty).
gr <- makeGRangesFromDataFrame(events[, c("SEQ_ID", "START", "END")],
                               seqnames.field = "SEQ_ID")
as.numeric(which(
  tabulate(queryHits(findOverlaps(gr, gr)), nbins = length(gr)) > 1
))

results <- results[events$ID, , drop = FALSE]

summary(events)





# Export the retained events as a GFF3 feature table and as a TSV.
# Constant columns are repeated explicitly and ATTRIBUTES is built with
# sprintf() so that an empty `events` table yields empty output files instead
# of a "differing number of rows" error.
n_events <- nrow(events)
gff <- data.frame(
  SEQ_ID = events$SEQ_ID,
  SOURCE = rep("MILORD", n_events),
  TYPE = rep("internal_eliminated_sequence", n_events),
  START = events$START,
  END = events$END,
  SCORE = rep(".", n_events),
  STRAND = rep(".", n_events),
  PHASE = rep(".", n_events),
  ATTRIBUTES = sprintf("ID=%s;Alias=%s", events$NAME, events$ID),
  stringsAsFactors = FALSE
)
# Features sorted by sequence, then start coordinate.
gff <- gff[order(gff$SEQ_ID, gff$START), ]

# Full event table, with header.
write.table(events,
            paste0("IES_on_Imprecisely_Eliminated_Regions.", ref_pref, ".tsv"),
            sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)

# GFF3 body: nine tab-separated columns, no header line.
write.table(gff,
            paste0("IES_on_Imprecisely_Eliminated_Regions.", ref_pref, ".gff3"),
            sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)



