
#Figure S3 
# peak = 0: no peak; peak = 1, 2 or 3: one peak
# functions ############################################################

library(RColorBrewer)

# Return the given colour(s) as hex strings carrying an alpha channel.
#   someColor : any colour specification accepted by col2rgb()
#   alpha     : opacity on a 0-255 scale (default 100)
makeTransparent <- function(someColor, alpha = 100) {
  # One column per input colour; rows are the red, green and blue channels.
  channels <- col2rgb(someColor)
  to_hex <- function(channel) {
    rgb(red = channel[1], green = channel[2], blue = channel[3],
      alpha = alpha, maxColorValue = 255)
  }
  apply(channels, 2, to_hex)
}




# Base colours; the figure code below uses ccol[1] for genes and ccol[3] for
# ORFs.
#ccol=c(rgb(0,128,255,max=255),"springgreen3", "purple","lightslateblue","midnightblue")
ccol <- c("grey22", "springgreen3", "purple", "lightslateblue", "midnightblue")

########################################################################
# PARAMETER USED FOR DATA_BILAN ANALYSIS ###############################

# Read-count thresholds (names suggest peak / phasing minima).
minread_peak <- 5
minread_peakb <- 5

minread_phas <- 15
minread_phasb <- 15

# Number of positions stored in each per-position count profile.
maxpos <- 101

pval_max <- 0.05
peakmin_order <- 1

# Text magnification factors used by the figure code.
cexa <- 1.2   # axis annotation
cexa2 <- 0.9  # margin text under the histograms
cexl <- 1.5   # axis labels and legends
cexn <- 1.5   # bar names


# Read the per-feature statistics table.
data_bilan <- read.table(
  "../../tables_outrevised/03plastid/table_all_stat.txt",
  header = TRUE
)

# V2: collapse the peak level into presence/absence; every level >= 1
# becomes 1, other values are kept as they are.
data_bilan$peak2 <- replace(data_bilan$peak, data_bilan$peak >= 1, 1)

# Separate tables for annotated genes and for ORFs; used below to count the
# number of genes / ORFs per peak level.
data_genes <- data_bilan[data_bilan$type == "gene", ]
data_orf <- data_bilan[data_bilan$type == "orf", ]

#genes =================================================================
# Genes per haplotype without (column 1) and with (column 2) a peak.
testpg <- table(data_genes$haplo, data_genes$peak2)
testpg # numbers
testpg[, 2] / rowSums(testpg[, 1:2]) # fraction of genes with a peak

# Distribution of the peak levels among features that have a peak
# (genes first, then ORFs).
prop.table(table(data_genes$peak[data_genes$peak != 0]))
prop.table(table(data_orf$peak[data_orf$peak != 0]))



# select all with peak != 0
# which() drops rows whose peak2 is NA instead of turning them into all-NA
# rows that would be counted as an extra "name" below.
select <- data_genes[which(data_genes$peak2 != 0), ]
length(unique(select$name))
length(unique(select$name)) / length(unique(data_genes$name))

# phasing
# Flag genes whose adjusted p-value is at or below pval_max; a missing padj
# counts as not significant.
selph <- select
selph$sig <- as.numeric(!is.na(selph$padj) & selph$padj <= pval_max)
# The table is built from the flag computed just above (selph$sig), so that
# it agrees with the ratio printed at the end of this section. Fixing the
# levels keeps both columns present even when one outcome never occurs.
tphas <- t(table(factor(selph$sig, levels = c(0, 1)), selph$haplo))

tphas # numbers
tphas[, 2] / rowSums(tphas) * 100 # percent

length(unique(selph$name[selph$sig == 1])) / length(unique(select$name))

#ORF =================================================================

# ORFs per haplotype without (column 1) and with (column 2) a peak.
testpg <- table(data_orf$haplo, data_orf$peak2)
testpg # numbers
testpg[, 2] / rowSums(testpg[, 1:2]) # fraction of ORFs with a peak

# Logical masks are wrapped in which() throughout this section: an NA in the
# mask would otherwise select an NA name (or an all-NA row), and unique()
# would count that NA as one additional ORF.
select <- data_orf[which(data_orf$peak2 != 0), ]
length(unique(select$name))
length(unique(select$name)) / length(unique(data_orf$name))
length(unique(data_orf$name[which(data_orf$peak2 == "1")]))

# phasing
# NOTE(review): this section relies on a `sig` column already present in the
# input table; it is not recomputed from padj here -- confirm this is intended.
selph <- select
tphas <- t(table(selph$sig, selph$haplo))
tphas # numbers
tphas[, 2] / rowSums(tphas) * 100 # percent
length(unique(selph$name[which(selph$sig == "1")])) / length(unique(select$name))
length(unique(data_orf$name[which(data_orf$peak2 == "1" & data_orf$sig == "1")]))


#add ages for figure ######################################################################
# Conservation table: 13 columns with the classes given below (no explicit
# header argument, so read.table() decides from the file layout).
data_rec <- read.table("../../tables_outrevised/02conservation/conservation_table_spar.txt",
  colClasses = c("character", "character", "character", "numeric", "character",
    rep("numeric", 8)))

#select table with ORF present in at least one sequenced strain
# `cons` is a string of one-character flags. The flags at positions 2, 7, 9
# and 22 are extracted directly with substr(), which does not depend on every
# string having the same length (a fixed-width matrix built from strsplit()
# would silently misalign if one did not).
select_cer <- substr(data_rec$cons, 2, 2)
select_A <- substr(data_rec$cons, 7, 7)
select_B <- substr(data_rec$cons, 9, 9)
select_C <- substr(data_rec$cons, 22, 22)
select_all <- select_cer == "1" | select_A == "1" | select_B == "1" |
  select_C == "1"

#create a matrix of presence absence to simplify
data_rec$Cer <- select_cer
data_rec$A <- select_A
data_rec$B <- select_B
data_rec$C <- select_C

# Keep rows flagged "1" at any of the four positions, plus every row of
# group "7_ancest".
data_red <- data_rec[select_all | data_rec$group == "7_ancest", ]

#add ORF ages

#Age 1 (A-BC div) : N2 = 1 and N1= 1 or 0
#Age 2 (B-C div) : N2=0 and N1=1
#Age 3 after div: N1=0 and N2=0

# Classify each retained ORF by age. Rows matching none of the rules keep 0.
# The N2 == 1 rule is applied last and therefore wins over the other two.
orf_age <- numeric(nrow(data_red))
orf_age[data_red$N1 == 0 & data_red$N2 == 0] <- 3
orf_age[data_red$N1 == 1 & data_red$N2 == 0] <- 2
orf_age[data_red$N2 == 1] <- 1
data_red$age <- orf_age

# Reduce to a name/age lookup and attach it to the statistics table; rows of
# data_bilan without a match get age NA, and genes are then set to age 0.
data_red <- data.frame(name = data_red$orf, age = data_red$age)
data_bilan <- merge(data_bilan, data_red, by = "name", all.x = TRUE)
data_bilan$age[data_bilan$type == "gene"] <- 0

# Attach columns 1 and 3 of the conservation table, matched on the ORF name.
eff_red <- data_rec[, c(1, 3)]
data_bilan <- merge(data_bilan, eff_red, by.x = "name", by.y = "orf", all.x = TRUE)

#add sizes
# Metadata table, restricted to its ORF rows.
meta <- read.table("../../tables_outrevised/table_S2_metaexp_withRT.txt", header = TRUE)
meta <- meta[meta$type == "orf", ]

# Columns 2 and 5 of the metadata are taken as identifier and size.
datasize <- data.frame(id = meta[, 2], size_aa = meta[, 5])

# Build the matching key "<column 2>;<column 1>" in data_bilan, then attach
# the sizes while keeping every data_bilan row.
data_bilan$id <- paste0(data_bilan[, 2], ";", data_bilan[, 1])
data_bilan <- merge(data_bilan, datasize, by = "id", all.x = TRUE)

#count per age
# Number of distinct names among the data_bilan rows where `keep` is TRUE.
# which() discards NA entries of the mask (age is NA for rows left unmatched
# by the merge above); indexing with the raw mask would select an NA name that
# unique() counts as one extra element.
n_unique_names <- function(keep) {
  length(unique(data_bilan$name[which(keep)]))
}

#nb >60
n_unique_names(data_bilan$age == "1")
n_unique_names(data_bilan$age == "2")
n_unique_names(data_bilan$age == "3" & data_bilan$haplo == "SD01")
n_unique_names(data_bilan$age == "3" & data_bilan$haplo == "SD06")
n_unique_names(data_bilan$age == "3" & data_bilan$haplo == "SA03")
#sig
n_unique_names(data_bilan$age == "1" & data_bilan$sig == "1")
n_unique_names(data_bilan$age == "2" & data_bilan$sig == "1")
n_unique_names(data_bilan$age == "3" & data_bilan$haplo == "SD01" & data_bilan$sig == "1")
n_unique_names(data_bilan$age == "3" & data_bilan$haplo == "SD06" & data_bilan$sig == "1")
n_unique_names(data_bilan$age == "3" & data_bilan$haplo == "SA03" & data_bilan$sig == "1")


######################################################################
#FIGURES 
######################################################################
# Palette choice: four colours from the ColorBrewer "YlGnBu" scheme.
colp <- brewer.pal(n = 4, name = "YlGnBu")
# Magnification of the panel letters passed to title(cex.main = ...).
lettersize <- 2

#=====================================================================
#hist and barplot per haplo now
#=====================================================================

# Helpers for the per-haplotype figure ================================

# Draw the mean read count per position as a line.
#   counts : data frame of per-position read counts, one row per gene/ORF
#   cols   : indices of the columns to plot; must supply 53 values to match
#            the x coordinates -2:50
#   colour : line colour
# Annotations are disabled (ann = FALSE), so no title or axis label is drawn.
# An empty selection produces a blank panel so that the following panels keep
# their slot in the layout.
plot_mean_profile <- function(counts, cols, colour) {
  if (nrow(counts) == 0) {
    plot.new()
    return(invisible(NULL))
  }
  mean_counts <- colSums(counts)[cols] / nrow(counts)
  plot(-2:50, mean_counts, type = "l", col = colour, xaxt = "n",
    ann = FALSE, lwd = 2, xlim = c(-2, 48.5))
  invisible(NULL)
}

# Normalise each row of `counts` over the columns `cols` (each row divided by
# its own sum over that window) and return the rows as a matrix ordered by
# increasing total count over ALL columns of `counts`.
normalise_counts <- function(counts, cols) {
  window <- as.matrix(counts[, cols, drop = FALSE])
  if (nrow(window) == 0) {
    return(window)
  }
  normalised <- window / rowSums(window)
  normalised[order(rowSums(counts)), , drop = FALSE]
}

# Draw the log2 heat map of a matrix returned by normalise_counts(), with the
# position ticks, tick labels and x-axis label shared by the three heat maps.
# An empty matrix produces a blank panel that still carries the x-axis label.
draw_heatmap <- function(normalised, palette) {
  if (nrow(normalised) == 0) {
    plot.new()
  } else {
    image(t(log2(normalised)), col = palette, axes = FALSE)
    axis(1, at = seq(3, 54, by = 3) / 53, labels = FALSE)
    mtext(seq(0, 50, by = 6), side = 1, line = 1, outer = FALSE,
      at = seq(3, 54, by = 6) / 53, cex = 0.8)
  }
  mtext("Position relative to start codon\n (in nt)", side = 1, line = 4)
  invisible(NULL)
}

# Settings shared by all haplotypes ===================================

chaplo <- unique(data_bilan$haplo)

# Grey scale for the heat maps.
#colh=brewer.pal(9, "Blues")
colh <- brewer.pal(9, "Greys")

# Threshold on nbread_phas separating highly (>) from lowly (<=) expressed
# genes.
min_exp <- 80

# Display names used in the heat-map captions, indexed by the position of the
# haplotype in chaplo.
cname_haplo <- c("SpC", "SpA", "SpB", "YPS128")

# Columns of the count profile used for the mean profiles and the heat maps.
profile_cols <- 49:maxpos
heatmap_cols <- 48:maxpos

# Layout: 4 rows x 6 columns of cells defining 12 panels; the code below
# draws into panels 1 to 10.
matlab <- c(1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 11, 12, 12, 12)
zones <- matrix(matlab, ncol = 6, byrow = TRUE)

# One figure per haplotype, skipping the first and the last entry of chaplo.
# NOTE(review): the skip is kept from the original loop bounds
# 2:(length(chaplo)-1); confirm it is intended. Negative indexing yields an
# empty sequence (instead of counting backwards) when chaplo has fewer than
# three entries.
for (i in seq_along(chaplo)[-c(1, length(chaplo))]) {
  haploi <- chaplo[i]

  tiff(filename = paste0("Supplementary_Fig/S2.FigS2_", haploi, ".tiff"),
    width = 6300, height = 7000, units = "px", res = 645)
  #svg(filename =paste("Figures_manuscript/3.Fig2_",haploi,".svg",sep=""), width = 14, height = 15, pointsize=16)

  # The device is closed in `finally`, so an error while drawing does not
  # leave the tiff file open.
  tryCatch({
    layout(zones, widths = c(3, 1, 2, 2, 1, 3), heights = c(5, 2, 6, 4))

    data_haplo <- data_bilan[data_bilan$haplo == haploi, ]
    data_genes <- data_haplo[data_haplo$type == "gene", ]
    data_orf <- data_haplo[data_haplo$type == "orf", ]

    # PEAK detection
    #===================================================================
    par(mar = c(6, 5, 3, 0), cex.axis = cexa, cex.lab = cexl)
    # Panel A: histogram of log2(nbread_start); genes first, ORFs overlaid.
    hist(log2(data_genes$nbread_start), ylim = c(0, 1500), breaks = 50,
      col = makeTransparent(ccol[1], 255), border = "white", main = "",
      xlab = "")
    hist(log2(data_orf$nbread_start), add = TRUE, breaks = 20,
      col = makeTransparent(ccol[3], 150), border = "white")

    mtext("Number of initiating", side = 1, line = 3, cex = cexa2)
    mtext(expression(paste("reads (", log[2], ")", sep = "")), side = 1,
      line = 5, cex = cexa2)
    title("A", cex.main = lettersize, adj = 0, line = 1, outer = FALSE)

    # Panel B: stacked bars of the peak levels, one bar per feature type.
    #===================================================================
    colp <- brewer.pal(5, "YlGnBu")
    tpeak <- table(data_haplo$type, data_haplo$peak)
    # NOTE(review): the ylim margin is taken from the per-peak-level totals
    # (column sums of tpeak), whereas the bars are per-type totals; kept as in
    # the original.
    barplot(t(tpeak), col = colp[2:5], border = "white",
      names.arg = c("Gen", "iORF"),
      ylab = "Number of genes or iORFs", xlim = c(0, 4),
      ylim = c(0, max(colSums(tpeak) + 500)),
      cex.lab = cexl, cex.names = cexn)

    legend("topright", c("p0", "p1", "p2", "p3"), fill = colp[2:5], cex = cexl)
    title("B", cex.main = lettersize, adj = 0, line = 1, outer = FALSE)

    # Phasing test
    #===================================================================
    # Features with a peak, flagged 1 when padj is at or below pval_max; a
    # missing padj counts as not significant.
    datap <- data_haplo[data_haplo$peak2 != 0, ]
    datap$sig <- as.numeric(!is.na(datap$padj) & datap$padj <= pval_max)

    # Panel C: histogram of log2(nbread_phas); genes first, ORFs overlaid.
    # NOTE(review): the histograms use all genes/ORFs of the haplotype, not
    # only those with a peak (datap); confirm this is intended.
    hist(log2(data_genes$nbread_phas), ylim = c(0, 2000), breaks = 40,
      col = makeTransparent(ccol[1], 255), border = "white", main = "",
      xlab = "")
    hist(log2(data_orf$nbread_phas), add = TRUE, breaks = 20,
      col = makeTransparent(ccol[3], 170), border = "white", cex = cexa)
    title("C", cex.main = lettersize, adj = 0, line = 1, outer = FALSE)

    mtext("Number of reads in", side = 1, line = 3, cex = cexa2)
    mtext(expression(paste("the first 50 nt (", log[2], ")", sep = "")),
      side = 1, line = 5, cex = cexa2)

    # Panel D: stacked bars of the phasing flag, one bar per feature type.
    par(mar = c(6, 5, 3, 0))
    tphas <- table(datap$type, datap$sig)
    # NOTE(review): as in panel B, the ylim margin uses the column sums of the
    # table rather than the per-type totals; kept as in the original.
    barplot(t(tphas), col = c(colp[2], colp[5]), border = "white",
      names.arg = c("Gen", "iORF"), ylab = "Number of genes or iORFs",
      xlim = c(0, 4), ylim = c(0, max(colSums(tphas)) + 1000),
      cex.lab = cexl, cex.names = cexn)

    legend("topright", c("No phas", "Phas"), fill = c(colp[2], colp[5]),
      cex = cexl)
    title("D", cex.main = lettersize, adj = 0, line = 1, outer = FALSE)

    # Profiles and heat maps by expression level
    #===================================================================
    # Significance here is the `sig` column of data_haplo (not datap$sig).
    select_sig <- data_haplo$sig == "1"

    # Per-position read counts: each `counts` entry is a "-"-separated string
    # that is parsed into one row of maxpos numeric columns.
    count <- data.frame(matrix(as.numeric(unlist(strsplit(
      as.character(data_haplo$counts), "-"))), ncol = maxpos, byrow = TRUE))

    # Row selections; which() drops NA mask entries instead of creating
    # all-NA rows.
    is_gene <- data_haplo$type == "gene"
    sel_gene_h <- count[which(select_sig & is_gene &
      data_haplo$nbread_phas > min_exp), ]
    sel_gene_l <- count[which(select_sig & is_gene &
      data_haplo$nbread_phas <= min_exp), ]
    sel_orf <- count[which(select_sig & data_haplo$type == "orf"), ]

    # Panel E, mean profiles: highly expressed genes (nbread_phas > min_exp)
    par(mar = c(0, 6, 3, 2))
    plot_mean_profile(sel_gene_h, profile_cols, ccol[1])
    mtext("Mean \nread counts", side = 2, line = 3)
    title("E", cex.main = lettersize, adj = 0, line = 1, outer = FALSE)

    # lowly expressed genes (nbread_phas <= min_exp), then ORFs
    par(mar = c(0, 6, 2, 2))
    plot_mean_profile(sel_gene_l, profile_cols, ccol[1])
    plot_mean_profile(sel_orf, profile_cols, ccol[3])

    # Heat maps, in the same order as the profiles above them
    par(mar = c(7, 6, 0, 2))

    # Genes HE
    cnorm <- normalise_counts(sel_gene_h, heatmap_cols)
    draw_heatmap(cnorm, colh)
    mtext("Normalized read counts ", side = 2, line = 4)
    mtext("per gene or tORF", side = 2, line = 2)
    mtext(paste(cname_haplo[i], "Genes HE n =", dim(cnorm)[1]), side = 1, line = 6)

    # Genes LE
    cnorm <- normalise_counts(sel_gene_l, heatmap_cols)
    draw_heatmap(cnorm, colh)
    mtext(paste(cname_haplo[i], "Genes LE n =", dim(cnorm)[1]), side = 1, line = 6)

    # ORFs
    cnorm <- normalise_counts(sel_orf, heatmap_cols)
    draw_heatmap(cnorm, colh)
    mtext(paste(cname_haplo[i]," tORFs n = ",dim(cnorm)[1]), side = 1, line = 6)
  }, finally = dev.off())
}


