# Script to detect initiation peaks and codon phasing from
# RPF read densities obtained with plastid

library(RColorBrewer)

# Input density table and output statistics table
file_in <- "../../13_TE_Cer/03_annotations/table_density_all.txt"
file_out <- "../../13_TE_Cer/03_annotations/table_all_stat.txt"
#haplo="SA03"

# Read-count thresholds for peak calling; the "b" value is the one
# applied to haplotype SD06
minread_peak <- 5
minread_peakb <- 5

# Read-count thresholds for the phasing test; the "b" value is the one
# applied to haplotype SD06
minread_phas <- 15
minread_phasb <- 15

# Last column of the count matrix included in the phasing window
maxpos <- 100
# Cut-off applied to the adjusted p-values of the phasing test
pval_max <- 0.05
# NOTE(review): peakmin_order is not referenced anywhere below
peakmin_order <- 1
# Line colours for the profile plots
ccol <- c("grey22", "springgreen3", "purple", "lightslateblue",
	"midnightblue")


#Prepare data
#=====================================================================
# Reads the density table and, for each haplotype, calls the initiation
# peak and tests codon phasing. The per-haplotype summaries are bound
# into `data_bilan` (NULL when the table contains no haplotype).

data_all <- read.table(file_in, header=TRUE)

# one summary table per haplotype, bound together after the loop
bilan_list <- vector("list", length(unique(data_all$haplo)))
chaplo <- unique(data_all$haplo)

for (i in seq_along(chaplo)){
	#Per haplotype
	#=====================================================================

	haplo <- chaplo[i]
	data <- data_all[data_all$haplo==haplo,]

	#matrix with counts of aligned reads: the "-"-separated string of each
	#row is split into 100 numeric columns (X1..X100).
	#The column is referenced by its full name `counts` (as in the summary
	#table below) rather than through `$` partial matching.
	count <- data.frame(matrix(as.numeric(unlist(strsplit(
		as.character(data$counts), "-"))), ncol=100, byrow=TRUE))
	rownames(count) <- data$name

	#nb of reads in the start window (columns 50-53) and in the
	#phasing window (columns 54-maxpos)
	data$nbread_start <- rowSums(count[,50:53])
	data$nbread_phas <- rowSums(count[,54:maxpos])

	#value of the highest peak per orf or gene
	topcount <- data$max1

	#peak threshold (specific value for SD06)
	minread_peaki <- if (haplo=="SD06") minread_peakb else minread_peak
	enough_start <- data$nbread_start > minread_peaki

	#peak detection, coded in data$peak:
	# 0 = no peak
	# 1 = unprecise peak: X51 higher than X49 and X53
	# 2 = precise peak:   X51 higher than X50 and X52
	# 3 = precise peak that also reaches the highest peak value (max1)
	select0 <- count$X51 > count$X53 & count$X51 > count$X49 & enough_start
	select1 <- count$X51 > count$X52 & count$X51 > count$X50 & enough_start
	select2 <- count$X51 >= topcount

	data$peak <- rep(0, nrow(data))
	data$peak[select0] <- 1
	data$peak[select1] <- 2
	data$peak[select1 & select2] <- 3

	#table nb peak (explicit print: values are not auto-printed in a loop)
	print(table(data$peak, data$type))

	#phasing test
	#=====================================================================
	#column indices of the three phases (without the first start codon)
	phase1_idx <- seq(54, maxpos, by=3)
	phase2_idx <- seq(55, maxpos, by=3)
	phase3_idx <- seq(56, maxpos, by=3)

	data$p1 <- rowSums(count[,phase1_idx])
	data$p2 <- rowSums(count[,phase2_idx])
	data$p3 <- rowSums(count[,phase3_idx])

	#change min read number for SD06 because less reads
	minread_phasi <- if (haplo=="SD06") minread_phasb else minread_phas

	#one-sided binomial test: are more than 1/3 of the phasing reads in
	#phase 1? Only rows with enough reads and at least one phase-1 read
	#are tested; the others keep pval = NA.
	tested <- which(data$nbread_phas > minread_phasi & data$p1 > 0)
	data$pval <- rep(NA_real_, nrow(data))
	data$pval[tested] <- vapply(tested, function(k) {
		binom.test(data$p1[k], data$nbread_phas[k], 1/3,
			alternative="greater")$p.value
	}, numeric(1))

	#FDR correction over the tests actually performed. The count is taken
	#from the same selection as the tests, so it cannot drift from the
	#test condition (it was previously computed with ">=" vs ">").
	nb_test <- length(tested)
	data$padj <- p.adjust(data$pval, method="fdr", n=nb_test)

	bilan_list[[i]] <- data.frame(haplo=data$haplo, cons=data$cons,
		name=data$name, type=data$type, nbread_start=data$nbread_start,
		nbread_phas=data$nbread_phas, p1=data$p1, p2=data$p2, p3=data$p3,
		peak=data$peak, pval=data$pval, padj=data$padj, counts=data$counts)

}

data_bilan <- do.call(rbind, bilan_list)

# Significance flag (column `sig`, 0/1). An entry is flagged when
#  - a peak was called (peak != 0, which already implies the
#    start-read threshold), and
#  - its adjusted phasing p-value exists and is <= pval_max
#    (a p-value only exists when the phasing-read threshold was met)
select_sig <- !is.na(data_bilan$padj) &
	data_bilan$padj <= pval_max &
	data_bilan$peak != "0"

data_bilan$sig <- replace(numeric(dim(data_bilan)[1]), select_sig, 1)

table(data_bilan$sig, data_bilan$type)



# Export the statistics table (tab-separated, unquoted)
#=====================================================================
write.table(data_bilan, file_out, sep="\t", quote=FALSE)



#V2 change peak presence/absence: 0 or 1
data_bilan$peak2 <- data_bilan$peak
data_bilan$peak2[data_bilan$peak >= 1] <- 1

#counts of orfs or genes per peak level

data_genes <- data_bilan[data_bilan$type=="gene",]
data_orf <- data_bilan[data_bilan$type=="orf",]

# In the tables below the 0/1 variable is turned into a factor with both
# levels, so the "1" column always exists and is addressed by name, even
# when one of the two values is absent from the data.

#genes =================================================================
testpg <- table(data_genes$haplo, factor(data_genes$peak2, levels=c(0, 1)))
testpg #numbers
testpg[,"1"]/rowSums(testpg) #fraction with a peak, per haplotype

#select all with peak != 0
select <- data_genes[data_genes$peak2 != 0,]
length(unique(select$name))
length(unique(select$name))/length(unique(data_genes$name))

#phasing: among entries with a peak, those with a significant adjusted
#p-value (same cut-off, pval_max, as the `sig` column of data_bilan)
selph <- select
selph$sig <- as.numeric(!is.na(selph$padj) & selph$padj <= pval_max)
tphas <- t(table(factor(selph$sig, levels=c(0, 1)), selph$haplo))

tphas #numbers
tphas[,"1"]/rowSums(tphas)*100 #percent

length(unique(selph$name[selph$sig == 1]))/length(unique(select$name))

#ORF =================================================================

testpg <- table(data_orf$haplo, factor(data_orf$peak2, levels=c(0, 1)))
testpg #numbers
testpg[,"1"]/rowSums(testpg) #fraction with a peak, per haplotype

select <- data_orf[data_orf$peak2 != 0,]
length(unique(select$name))
length(unique(select$name))/length(unique(data_orf$name))

#phasing (computed exactly as for the genes)
selph <- select
selph$sig <- as.numeric(!is.na(selph$padj) & selph$padj <= pval_max)
tphas <- t(table(factor(selph$sig, levels=c(0, 1)), selph$haplo))
tphas #numbers
tphas[,"1"]/rowSums(tphas)*100 #percent
length(unique(selph$name[selph$sig == 1]))/length(unique(select$name))

######################################################################
#FIGURES
######################################################################
#palette choice
colp <- brewer.pal(4, "YlGnBu")


#Mean read profiles and per-entry heatmaps for one haplotype
########################################################################

#colh=brewer.pal(9, "Blues")
colh <- brewer.pal(9, "Greys")

# Columns of the count matrix shown in the figure (49..100) and the
# x coordinates they are plotted at (-2..49).
fig_cols <- 49:100
fig_pos <- -2:49

# Draws the mean read count per displayed column for the rows of `counts`.
# `title` is passed as `main` but, as in the original figure, annotation is
# switched off (ann=FALSE), so it is not drawn.
# An empty selection yields an empty panel so the layout stays aligned.
plot_mean_profile <- function(counts, line_col, title) {
	if (nrow(counts) == 0) {
		plot.new()
		return(invisible(NULL))
	}
	mean_counts <- colSums(counts[, fig_cols, drop=FALSE])/nrow(counts)
	plot(fig_pos, mean_counts, type="l", col=line_col,
		xaxt="n", ann=FALSE, lwd=2, main=title, xlim=c(-2, 48))
	invisible(mean_counts)
}

# Draws the heatmap of log2 read counts over the displayed columns, each
# row being normalised by its own sum over those columns and rows being
# ordered by their total count over all columns. Adds the x-axis ticks and
# position labels. Returns (invisibly) the number of rows drawn.
plot_norm_heatmap <- function(counts, palette) {
	if (nrow(counts) == 0) {
		plot.new()
		return(invisible(0L))
	}
	total_per_row <- rowSums(counts)
	window <- as.matrix(counts[, fig_cols, drop=FALSE])
	norm_counts <- window/rowSums(window)
	norm_counts <- norm_counts[order(total_per_row), , drop=FALSE]
	image(t(log2(norm_counts)), col=palette, axes=FALSE)
	axis(1, at=seq(2, 50, by=3)/51, labels=FALSE)
	mtext(seq(1, 50, by=6), side=1, line=1, outer=FALSE,
		at=seq(2, 50, by=6)/51, cex=0.8)
	invisible(nrow(norm_counts))
}


haplo <- "S288C"

tiff(paste0("../figures/03plastid/Fig3_exp_", as.character(haplo), ".tiff"),
	width = 1800, height = 1600,
	units = "px", res=300)

# 2 x 2 layout: profiles on top (genes, iORFs), heatmaps below
zones <- matrix(1:4, ncol = 2, byrow = TRUE)
layout(zones, widths=c(4,4), heights = c(5,8))

data_haplo <- data_bilan[data_bilan$haplo==haplo,]

select_sig <- data_haplo$sig == "1"

count <- data.frame(matrix(as.numeric(unlist(strsplit(
	as.character(data_haplo$counts), "-"))), ncol=100, byrow=TRUE))

row.names(count) <- data_haplo$name

# rows flagged as significant, split by type
sig_gene_counts <- count[select_sig & data_haplo$type=="gene", ]
sig_orf_counts <- count[select_sig & data_haplo$type=="orf", ]

#plot profiles=====================================================
par(mar=c(0,5,2,2))

#genes
plot_mean_profile(sig_gene_counts, ccol[1],
	paste(haplo, " Genes  sig n =", nrow(sig_gene_counts)))
mtext("Mean read counts", side=2, line=3, cex=0.8)

#iORFs
plot_mean_profile(sig_orf_counts, ccol[3],
	paste(haplo, " iORFs sig n =", nrow(sig_orf_counts)))

#heatmaps==========================================================
par(mar=c(7,5,0,2))

#genes
n_gene <- plot_norm_heatmap(sig_gene_counts, colh)
mtext("Normalized read counts \nper gene or iORF", side=2, line=2, cex=0.8)
mtext("Position relative to start codon\n (in nucleotides)", side=1, line=4, cex=0.8)
# caption uses `haplo` (the previously referenced `haploi` is never defined)
mtext(paste(haplo, "Genes n =", n_gene), side=1, line=6, cex=0.8)

#ORFs
n_orf <- plot_norm_heatmap(sig_orf_counts, colh)
mtext("Position relative to start codon\n (in nucleotides)", side=1, line=4, cex=0.8)
mtext(paste(haplo, "ORFsT1 n =", n_orf), side=1, line=6, cex=0.8)

dev.off()




#look at conservation between translated and not translated ORFs
# NOTE(review): no `header` argument is given, so the column names used
# below (orf, N1, N2) rely on read.table detecting the header row by
# itself - confirm against the file.
datacons <- read.table("../tables_out/02conservation/conservation_table_spar.txt",
	colClasses=c("character","character","character","numeric", "character",
	rep("numeric",8)))

# age class derived from N1/N2: 1 when N2 == 1, 2 when N1 == 1 and N2 == 0,
# 3 when both are 0 (0 otherwise)
datacons$age <- rep(0, nrow(datacons))
datacons$age[datacons$N1 == 0 & datacons$N2 == 0] <- 3
datacons$age[datacons$N1 == 1 & datacons$N2 == 0] <- 2
datacons$age[datacons$N2 == 1] <- 1

# keep only the merge key and the age class (selected by name)
consred <- datacons[, c("orf", "age")]

# data_bilan has no `age` column, so merge() adds no suffix: the merged
# column is called `age` (not `age.x`)
data_orf <- merge(data_bilan[data_bilan$type=="orf",], consred,
	by.x="name", by.y="orf", all.x=FALSE, all.y=FALSE)

# rows of age class 1 (labelled N2 in the plots)
is_n2 <- data_orf$age == 1
cons_labels <- c("N2\nCons","N2\nSpar", "N2\nDiv", "N2\nDivG","N2\nPol")

#compare translated vs non translated conservation of old orfs
colp <- brewer.pal(5, "YlGnBu")
png("../figures/03plastid/conservation_t0vst1.png", res=100, width = 800, height = 500)

par(mfrow=c(1,2))
#colb=rev(brewer.pal(n = 11, name ="BrBG"))[3]

# entries 2..6 of the `cons` table are plotted; the first one is skipped
n2t0 <- data_orf[is_n2 & data_orf$sig == 0,]
barplot(table(n2t0$cons)[2:6], col=colp[2], border="white",
	names.arg=cons_labels, ylab="Number of ORFs")

n2t1 <- data_orf[is_n2 & data_orf$sig == 1,]
barplot(table(n2t1$cons)[2:6], col=colp[5], border="white",
	names.arg=cons_labels, ylab="Number of ORFs")

dev.off()

# sig (rows) x conservation class (columns 2..6) among age-class-1 ORFs
n2t <- table(data_orf$sig[is_n2], data_orf$cons[is_n2])[,2:6]

chisq.test(n2t)

#NS
#barplot sig
ccol <- c("seagreen","springgreen3", "darkorchid1","lightslateblue",
	"purple","midnightblue")


sig_orf <- data_orf[data_orf$sig == 1 & data_orf$type == "orf",]
tsig <- table(sig_orf$cons, sig_orf$age)


notsig_orf <- data_orf[data_orf$sig == 0 & data_orf$type == "orf",]
notsig <- table(notsig_orf$cons, notsig_orf$age)




tiff(filename="../figures/02conservation/Figure3_barplot_cons.tiff",
	width = 800, height = 500, units = "px", res=120)
par(mfrow=c(2,1), mar=c(3,4,1,1))
#not significant ORFs (top) and significant ORFs (bottom), one group of
#bars per age class
barplot(notsig[2:6,], beside=TRUE, col=ccol[2:6], border="white",
	names.arg=c("N2","N1","Term"), ylab="ORFT0 Numbers",
	legend.text=c("Cons","Spar", "Div","DivG","Pol"), xlim=c(0,25))

barplot(tsig[2:6,], beside=TRUE, col=ccol[2:6], border="white",
	names.arg=c("N2","N1","Term"), ylab="ORFT1 Numbers", xlim=c(0,25))

dev.off()

