# Filter ORFs on gap content and on alignment identity statistics.
#
# Thresholds used by the filtering steps of this script.

# Upper bound on the gap fraction of an ORF (gap count divided by ORF size).
gapmax <- 0.20
# Number of rows an ORF id must still have after filtering to be kept.
nbhaplo <- 28

# Lower bound applied to the `minsid` statistic (identity on the sid).
min_idsid <- 0.50
# Lower bound applied to the `minorf` statistic (extended ORF alignment).
min_orf <- 0.60
# Load the input tables (paths are relative to the working directory).

# Masked ORF table, read as nine GFF-style columns.
dataorf <- read.table("../../03synt_intergenic_orf/02_ORF_tables/01masking_steps/5.table_orf_allfilt.gff")
names(dataorf) <- c(
  "seqid", "source", "type", "orf_start", "orf_stop",
  "score", "strand", "phase", "infos"
)

# Per-sid alignment statistics: identifier, maximum and minimum value.
statsid <- read.table("../../02synt_intergenic_fasta/04_mask_RM_db/03_masked_fasta/align_sid_stat.txt")
names(statsid) <- c("sid", "maxsid", "minsid")

# Per-ORF alignment statistics: identifier, maximum and minimum value.
statorf <- read.table("../../03synt_intergenic_orf/02_ORF_extalign/stat_alignORF_300.txt")
names(statorf) <- c("orf", "maxorf", "minorf")

# Split the attribute column (`infos`) into its ";"-separated fields.
# Fields used below, by position: 1 = id_orf, 3 = haplo, 4 = stat, 5 = nbgap.
# Field 2 is not used by this script.
info_fields <- strsplit(as.character(dataorf$infos), ";")
# Reshaping the flattened split into a fixed-width matrix shifts every
# following record as soon as one entry has a different number of fields,
# so check the field count first and fail loudly instead of misaligning.
if (!all(lengths(info_fields) == 5L)) {
  stop("every 'infos' entry must hold exactly 5 ';'-separated fields",
       call. = FALSE)
}
# as.character() keeps the reshape valid when the table has zero rows.
tinfos <- matrix(as.character(unlist(info_fields)), ncol = 5, byrow = TRUE)

# Derived columns. Their creation order is kept as is because later steps
# of the script rely on the resulting column layout.
dataorf$size <- dataorf$orf_stop - dataorf$orf_start + 1
dataorf$orf_id <- sub("id_orf=", "", tinfos[, 1])
dataorf$haplo <- sub("haplo=", "", tinfos[, 3])
dataorf$statut <- sub("stat=", "", tinfos[, 4])
# nbgap stays a character string here; it is converted where it is used.
dataorf$nbgap <- sub("nbgap=", "", tinfos[, 5])

# The sid is the first of the two ";"-separated parts of seqid.
seqid_fields <- strsplit(as.character(dataorf$seqid), ";")
if (!all(lengths(seqid_fields) == 2L)) {
  stop("every 'seqid' entry must hold exactly 2 ';'-separated fields",
       call. = FALSE)
}
dataorf$sid <- vapply(seqid_fields, `[`, character(1), 1L)


# Attach the sid and ORF identity statistics to every ORF record
# -----------------------------------------------------------------------
# Both merges use the default join, so only records with a matching entry
# in the statistics table are kept.

# Per-sid statistics, joined on the shared "sid" column.
dataorf <- merge(dataorf, statsid, by = "sid")
# Per-ORF statistics: "orf_id" in dataorf corresponds to "orf" in statorf.
dataorf <- merge(x = dataorf, y = statorf, by.x = "orf_id", by.y = "orf")

# Distribution of the two statistics that the filters threshold on.
hist(dataorf$minsid)
hist(dataorf$minorf)


# FILTERS

# Drop the "N3" rows (reconstructions, considered non-informative).
# which() keeps rows with a missing comparison from turning into all-NA rows.
dataorf <- dataorf[which(dataorf$haplo != "N3"), ]

# Relate the gap count to the ORF size to guard against large indels:
# keep only ORFs whose gap fraction is at most `gapmax`, and whose `minorf`
# and `minsid` statistics reach `min_orf` and `min_idsid` respectively.
dataorf$perc <- as.numeric(dataorf$nbgap) / as.numeric(dataorf$size)
dataf1 <- dataorf[which(
  dataorf$perc <= gapmax &
    dataorf$minorf >= min_orf &
    dataorf$minsid >= min_idsid
), ]

# Count the rows left per ORF id; ids that still have exactly `nbhaplo`
# rows are the ones kept below.
test_count <- data.frame(table(dataf1$orf_id))
colnames(test_count) <- c("id", "count")
dataf1 <- merge(dataf1, test_count, by.x = "orf_id", by.y = "id")

# dataf2: the final filtered data.
# Columns are picked by name rather than by position so that a change in an
# upstream table raises an error instead of silently selecting other columns.
gff_cols <- c(
  "seqid", "source", "type", "orf_start", "orf_stop",
  "score", "strand", "phase", "infos"
)
dataf2 <- dataf1[
  which(dataf1$count == nbhaplo),
  c("orf_id", gff_cols, "size", "haplo", "statut", "nbgap", "perc", "count")
]
# GFF view of the same rows: the nine original GFF columns only.
finalgff <- dataf2[, gff_cols]


# Number of distinct ORF ids that passed every filter.
length(unique(dataf2[["orf_id"]]))

# Write the final filtered tables: tab-separated, without quotes, row names
# or a header line.

# Full table with the derived columns.
dir_cons <- "../../03synt_intergenic_orf/02_ORF_tables/table_orf_filtcons.txt"
write.table(
  dataf2,
  file = dir_cons,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

# GFF columns only.
dir_gff <- "../../03synt_intergenic_orf/02_ORF_tables/table_orf_filtcons.gff"
write.table(
  finalgff,
  file = dir_gff,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)




