
# Script to assign each ORF to a conservation group 
#haplos : 24 HC strains + Y128 + S288C and 2 ancestral reconstructions
nbhaplo <- 28

# Read data
# Filtered ORFs
#====================================================================
datadir <- "../../03synt_intergenic_orf/02_ORF_tables/table_orf_filtcons.txt"
dataorf <- read.table(datadir)

# Name columns (the input file is read without a header line)
colnames(dataorf) <- c(
  "orf_id", "chrom", "tool", "feature", "start", "stop",
  "score", "strand", "x", "infos", "size", "haplo", "statut", "nbgap",
  "gapcov", "nbhaplo"
)

# Correspondence table: lineages and positions in the phylogeny
corresdir <- "infos_lib.txt"
corres <- read.table(corresdir, header = TRUE)

# Add an id with number to the ORF data
#====================================================================

# id = "<order>_<lib>"
corres$id <- paste0(corres$order, "_", corres$lib)
dataorf <- merge(dataorf, corres, by.x = "haplo", by.y = "lib")

# Keep ORFs that are present (statut == 1) and build the presence/absence
# table: one row per orf_id, one column per library `order` value
data1 <- dataorf[dataorf$statut == 1, ]

# Fix the column set explicitly: table() sorts its columns by `order`, so
# a library without any present ORF would otherwise lose its column and
# shift every hard-coded column index used further down the script.
lib_order <- sort(unique(dataorf$order))
tcons <- table(data1$orf_id, factor(data1$order, levels = lib_order))

# Label each column with the id of the library that has that `order` value.
# Looking the id up by `order` (instead of taking corres$id in file row
# order) keeps labels correct even if infos_lib.txt is not sorted by order.
colnames(tcons) <- corres$id[match(lib_order, corres$order)]

#Conservation level
#=======================================================================
#Conserved
#species specific
#lineage specific
#polymorphic

# Build one pattern string per ORF by pasting together, left to right,
# the values found in columns 1 to 26 of tcons
cons <- Reduce(
  function(pattern, j) paste(pattern, tcons[, j], sep = ""),
  2:26,
  tcons[, 1]
)

# One row per ORF: its id, its pattern string and the sum over all
# columns of tcons
datac <- data.frame(
  cbind(orf = rownames(tcons), cons),
  count = apply(tcons, 1, sum)
)
rownames(datac) <- rownames(tcons)




# Assign a conservation group (and a matching numeric code `col`) to every
# ORF. Every ORF starts as polymorphic; the rules below overwrite group/col
# in sequence, so a later rule takes precedence over an earlier one.
datac$group <- rep("6_polymorphic", nrow(datac))
datac$col <- rep(6, nrow(datac))

# Fixed lineage / polymorphic
# Row sums over fixed column ranges of tcons.
# drop = FALSE keeps the subset two-dimensional when tcons has a single
# row, so the sums still work for a one-ORF table.
datac$Cer <- rowSums(tcons[, 1:2, drop = FALSE])
datac$A <- rowSums(tcons[, 3:7, drop = FALSE])
datac$B <- rowSums(tcons[, 8:17, drop = FALSE])
datac$C <- rowSums(tcons[, 18:26, drop = FALSE])
datac$Anc <- rowSums(tcons[, 27:28, drop = FALSE])

# Columns 27 and 28 are also kept individually
datac$N1 <- tcons[, 27]
datac$N2 <- tcons[, 28]

# A column range is "fixed" when the ORF is counted in all of its columns
# or in none of them
testCer <- datac$Cer == 2 | datac$Cer == 0
testA <- datac$A == 5 | datac$A == 0
testB <- datac$B == 10 | datac$B == 0
testC <- datac$C == 9 | datac$C == 0
all_fixed <- testCer & testA & testB & testC

datac$group[all_fixed] <- "4_divergent"
datac$col[all_fixed] <- 4

# Conserved: total count equals the number of haplotypes
is_conserved <- datac$count == nbhaplo
datac$group[is_conserved] <- "1_conserved"
datac$col[is_conserved] <- 1

# Species specific: only in the Cer columns, or in all columns of A, B and
# C but in no Cer column
datac$group[datac$Cer == 2 & datac$A == 0 & datac$B == 0 & datac$C == 0] <- "2_SpCer"
datac$group[datac$Cer == 0 & datac$A == 5 & datac$B == 10 & datac$C == 9] <- "3_pPar"
datac$col[datac$group == "2_SpCer"] <- 2
datac$col[datac$group == "3_pPar"] <- 3

# Only in ancestral sequences
datac$group[datac$Cer == 0 & datac$A == 0 & datac$B == 0 & datac$C == 0 &
  datac$Anc != 0] <- "7_ancest"
# These rows also satisfy the all-zero "fixed" test above, so without this
# line they would keep col = 4 (divergent) while being labelled 7_ancest.
datac$col[datac$group == "7_ancest"] <- 7

# Lineage specific: pattern string set on exactly the A, B or C positions
lineage_patterns <- c(
  "00111110000000000000000000",
  "00000001111111111000000000",
  "00000000000000000111111111"
)
datac$group[datac$cons %in% lineage_patterns] <- "5_SpeGroup"
datac$col[datac$group == "5_SpeGroup"] <- 5

# Remove Cer specific ORFs: the "2_SpCer" group, plus the two patterns in
# which only the first or only the second position is set
cer_single_patterns <- c(
  "10000000000000000000000000",
  "01000000000000000000000000"
)
data_spar1 <- datac[
  datac$group != "2_SpCer" & !(datac$cons %in% cer_single_patterns),
]

# Remove N3 specific (not informative because Cer outgroup):
# rows whose pattern string is all zeros and whose Anc count is 0
test <- data_spar1$cons == "00000000000000000000000000" & data_spar1$Anc == 0
data_spar <- data_spar1[!test, ]

# Renaming columns N1 and N2 to fit with the figure is disabled on purpose:
# the line below stays commented out so the old node names are kept everywhere.
#colnames(data_spar)[11:12]=c("N2","N1")


# Counts
#######################################################################
dim(data_spar) # total (without cer specific)

# Percentage of ORFs in each group, rounded to two decimals
table_count <- table(data_spar$group)
round(prop.table(table_count) * 100, digits = 2)

# Write the conservation table (one line per ORF family) to both
# output locations
for (out_path in c(
  "../tables_out/02conservation/conservation_table_spar.txt",
  "../../03synt_intergenic_orf/03_ORF_cons_spar/conservation_table_spar.txt"
)) {
  write.table(
    data_spar, out_path,
    sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE
  )
}

#######################################################################
# Create a table of final ORFs with one line per haplo

# Ids of the ORFs kept in data_spar
finalorf <- data_spar$orf

# Select the final ORF set (without cerevisiae specific)
data_final <- dataorf[dataorf$orf_id %in% finalorf, ]

# Check ORF number
length(unique(data_final$orf_id))

# Add the conservation group of each ORF
consgroup <- data.frame(orf_id = data_spar$orf, group = data_spar$group)
data_finalc <- merge(data_final, consgroup, by = "orf_id")

# Write the ORF table (one line per haplo) to both output locations
for (out_path in c(
  "../tables_out/02conservation/orf_table_spar.txt",
  "../../03synt_intergenic_orf/03_ORF_cons_spar/orf_table_spar.txt"
)) {
  write.table(
    data_finalc, out_path,
    sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE
  )
}

