
# Load the list of highly conserved IESs and attach the coordinate columns
# (scaffold, startLocs, endLocs) from the IES description table. The join is
# on IES_ID and keeps every conserved IES even when it has no description row.
conserved <- read.table(
  "List_Highly_conserved_IESs.txt",
  sep = "\t", header = TRUE, stringsAsFactors = FALSE
)
conserved <- merge(
  conserved,
  read.table(
    "IES_paramecia_description.txt",
    sep = "\t", header = TRUE, stringsAsFactors = FALSE
  )[, c("IES_ID", "scaffold", "startLocs", "endLocs")],
  by = "IES_ID",
  all.x = TRUE
)

# Keep only rows whose Species code is "pte"
# (presumably P. tetraurelia, matching the annotation path below -- confirm).
conserved <- conserved[conserved$Species == "pte", ]

# Coordinate key of the form "<scaffold>:<start>-<end>" used for the join below.
conserved$UNIQUENAME <- paste0(
  conserved$scaffold, ":", conserved$startLocs, "-", conserved$endLocs
)


# Reference IES annotation table; the same coordinate key is built from its
# SEQ_ID / START / END columns.
ies <- read.table(
  "/data/PARAMECIUM/GENOMIC/tetraurelia/micronucleus/IES/51/internal_eliminated_sequence_PGM_IES51.pt_51.gff3.tab",
  header = TRUE, stringsAsFactors = FALSE
)
ies$UNIQUENAME <- paste0(ies$SEQ_ID, ":", ies$START, "-", ies$END)

# Left join on the coordinate key: conserved IESs without an exact coordinate
# match keep ID = NA. NOTE: `t` masks base::t() as a variable name; it is kept
# because later statements in this script refer to it.
t <- merge(conserved, ies[, c("ID", "UNIQUENAME")], by = "UNIQUENAME", all.x = TRUE)


#' Look up an IES ID using a row's alternative coordinates
#'
#' `startLocs` and `endLocs` may each hold several comma-separated positions.
#' Every start/end pair (paired by position) is turned into a
#' "<scaffold>:<start>-<end>" key and searched in `ies_table$UNIQUENAME`.
#'
#' @param x A named vector describing one row, with elements "scaffold",
#'   "startLocs" and "endLocs" (as produced by `apply(df, 1, ...)`).
#' @param ies_table Data frame with columns `UNIQUENAME` and `ID`. Defaults to
#'   the script-level `ies` table, so existing one-argument calls still work.
#' @return A single ID from `ies_table`, or `NA` when no candidate key matches.
#'   When several candidates match, the last matching candidate wins (as in
#'   the original loop); if a key occurs more than once in `ies_table`, its
#'   first occurrence is used so that the result is always a scalar.
closestIES <- function(x, ies_table = ies) {
  # apply() passes rows through as.matrix(), which can pad values with blanks
  # when columns are formatted, so every field is trimmed before use.
  seq_id <- trimws(as.character(x["scaffold"]))
  startLocs <- trimws(strsplit(as.character(x["startLocs"]), split = ",")[[1]])
  endLocs <- trimws(strsplit(as.character(x["endLocs"]), split = ",")[[1]])

  # One candidate key per start position; a missing end position becomes "NA"
  # in the key and therefore simply fails to match.
  candidates <- paste0(
    seq_id, ":", startLocs, "-", endLocs[seq_along(startLocs)]
  )

  # Look up each candidate itself (the previous code looked up a hard-coded
  # key regardless of which candidate had been found).
  hits <- match(candidates, ies_table$UNIQUENAME)
  hits <- hits[!is.na(hits)]
  if (length(hits) == 0L) {
    return(NA)
  }
  ies_table$ID[hits[length(hits)]]
}

# Rows with no exact coordinate match: try each of their alternative
# start/end positions via closestIES() and fill in any ID that is found.
unmatched <- is.na(t$ID)
t[unmatched, "ID"] <- apply(t[unmatched, ], 1, closestIES)

# Rows that are still missing an ID after the rescue step.
t[is.na(t$ID), ]

# Export the resolved IESs (ID, Annotation, IES_family) as a tab-separated file.
write.table(
  t[!is.na(t$ID), c("ID", "Annotation", "IES_family")],
  "../ptet_HighlyConserved_IESs.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

