#Script to draw figure 4 
#multivariate analysis 
#boxplots properties
#test PCA

# Minimum total RNA read count required for TE comparisons
# (very low read counts would lead to over-estimated values).
mintot <- 10

# Character expansion factors used by the plotting calls of this script:
# cexi for axis annotation, plot symbols and legend; cexj for margin text.
cexi <- 0.8
cexj <- 0.7

# Size of the panel letters drawn with title().
lettersize <- 1.2

# Colour vectors available to the plotting code.
ccol <- c("grey22", "springgreen3", "purple", "lightslateblue", "midnightblue")
colreg <- c(1, "purple3")

source("myBiplot.R") 
library(RColorBrewer)

#Functions #############################################################
# Make transparent colours.
#   someColor: one or more colour specifications accepted by col2rgb()
#   alpha:     opacity passed to rgb() on a 0-255 scale (maxColorValue = 255)
# Returns one colour string (with alpha channel) per input colour.
makeTransparent <- function(someColor, alpha = 100) {
  # 3-row matrix (red / green / blue), one column per input colour
  rgb_channels <- col2rgb(someColor)

  with_alpha <- function(channel_values) {
    rgb(
      red = channel_values[1],
      green = channel_values[2],
      blue = channel_values[3],
      alpha = alpha,
      maxColorValue = 255
    )
  }

  apply(rgb_channels, 2, with_alpha)
}

# Plot pairwise comparisons.
#
# Shared implementation behind pairwise_test() and pairwise_test2().
# Opens an empty panel, runs a Wilcoxon test for every pair of groups and,
# for each pair with p <= 0.05, draws a horizontal segment between the two
# groups with a star label above it.
#   select_test: data.frame with columns `trait` (numeric) and `factor`
#   c1, c2:      x positions of the first / second group of each pair
#   n1, n2:      values of `factor` identifying the first / second group
#   left_shift, right_shift: offsets added to c1 / c2 for the segment ends
#   label_shift: offset added to c1 + c2 before halving for the label position
.draw_pairwise_tests <- function(select_test, c1, c2, n1, n2,
                                 left_shift, right_shift, label_shift) {
  plot(1:2, 1:2, type = "n", axes = FALSE, xlab = "", ylab = "",
       xlim = c(0, max(c2) + 1), ylim = c(0, length(c2) + 2))

  for (i in seq_along(c1)) {
    group1 <- select_test$trait[select_test$factor == n1[i]]
    group2 <- select_test$trait[select_test$factor == n2[i]]

    pvalg <- wilcox.test(group1, group2)$p.value
    print(pvalg)

    # A missing p-value (e.g. when every observation is tied) cannot be
    # compared to a threshold; treat it as non-significant instead of
    # aborting the whole figure.
    if (is.na(pvalg) || pvalg > 0.05) {
      next
    }

    lines(c(c1[i] + left_shift, c2[i] + right_shift), c(i, i))

    # significance stars
    labg <- if (pvalg <= 0.001) {
      "***"
    } else if (pvalg <= 0.01) {
      "**"
    } else {
      "*"
    }
    text((c1[i] + c2[i] + label_shift) / 2, i + 0.4, label = labg, cex = 1.5)
  }
  invisible(NULL)
}

# Pairwise comparison panel whose segments are shifted by 0.5 on both ends
# and whose labels are centred on (c1 + c2 + 1) / 2.
pairwise_test <- function(select_test, c1, c2, n1, n2) {
  .draw_pairwise_tests(select_test, c1, c2, n1, n2,
                       left_shift = 0.5, right_shift = 0.5, label_shift = 1)
}

# Variant of pairwise_test() with segments running from c1 to c2 + 0.2 and
# labels centred on (c1 + c2) / 2.
pairwise_test2 <- function(select_test, c1, c2, n1, n2) {
  .draw_pairwise_tests(select_test, c1, c2, n1, n2,
                       left_shift = 0, right_shift = 0.2, label_shift = 0)
}


# Build the matrix of mean trait values used for the heat maps.
#   selrange: data.frame with columns `factor`, `trait` and `rangei`.
#             `factor` is used as a 0-based row code (row = factor + 1) and
#             `rangei` as a 1-based column index. Both are pasted into a
#             "factor-rangei" key and split again on "-", so neither value
#             may itself contain a "-".
# Returns a matrix with max(factor) + 1 rows and max(rangei) columns holding
# the mean of `trait` per (factor, rangei) cell; cells without data stay NA.
make_tableimage <- function(selrange) {
  # mean per factor and size range
  selrange$idrange <- paste(selrange$factor, "-", selrange$rangei, sep = "")
  meantrait <- tapply(selrange$trait, selrange$idrange, mean)

  ttrait <- data.frame(idrange = names(meantrait), meantrait = meantrait)

  # split the key once: column 1 = factor, column 2 = range index
  id_parts <- matrix(unlist(strsplit(as.character(ttrait$idrange), "-")),
                     ncol = 2, byrow = TRUE)
  ttrait$rangei <- as.numeric(id_parts[, 2])
  ttrait$factor <- as.numeric(id_parts[, 1])

  # matrix for image(): rows = factor levels, columns = size ranges
  nbranges <- max(ttrait$rangei)
  nbfact <- max(ttrait$factor) + 1

  tplot_trait <- matrix(NA, nrow = nbfact, ncol = nbranges)
  for (coli in seq_len(nbranges)) {
    for (rawi in seq_len(nbfact)) {
      cell <- ttrait$meantrait[ttrait$rangei == coli & ttrait$factor + 1 == rawi]
      # only fill cells for which a mean exists
      if (length(cell) != 0) {
        tplot_trait[rawi, coli] <- cell
      }
    }
  }
  tplot_trait
}

#Analysis #############################################################

# Read the summary table and the conservation table
data <- read.table("../../00_tables/table_S2_metaexp_withRT.txt", header = TRUE)
data_consorf <- read.table("../../00_tables/02conservation/conservation_table_spar.txt", header = TRUE)
orf_names <- data_consorf$orf

# Keep the features listed in the conservation table plus every feature of
# type "gene". which() is used so that a missing `type` does not inject an
# all-NA row into the table.
data_orfsel <- data[data$feat_name %in% orf_names, ]
data_genesel <- data[which(data$type == "gene"), ]
data <- rbind(data_orfsel, data_genesel)

# Add a conservation group prefixed with a number to help visualisation
# (the prefix fixes the display order). Values of `cons` without an entry
# below are kept unchanged.
cons_labels <- c(Gene = "0_gene", Cons = "1_Cons", Spar = "2_Spar",
                 Div = "3_Div", DivG = "4_DivG", Pol = "5_Pol")
data$cons2 <- as.character(data$cons)
known_cons <- data$cons2 %in% names(cons_labels)
data$cons2[known_cons] <- unname(cons_labels[data$cons2[known_cons]])

#######################################################################
# graphic with boxplots, heatmap and correlations
#######################################################################

# Keep the features flagged sig == "1" (described in the original comment
# as haplotypes with significant translation signatures).
# NA-safe: rows whose flag is missing are dropped rather than turned into
# all-NA rows.
data_sig <- data[which(data$sig == "1"), ]

# Remove features with fewer than `mintot` total reads so that they do not
# affect TE; rows with a missing read count are dropped as well.
data_sig <- data_sig[which(data_sig$TOT_start >= mintot), ]


##########################################################################
# Graphic 2: boxplots - output device and panel layout
##########################################################################

#tiff(paste("Figures_manuscript/4.Figure4_properties_mintot_V2",mintot,".tiff",sep=""), width = 950, height = 1200,res=130)
fig_file <- paste0("Figures_manuscript/4.Figure4_properties_mintot_V2", mintot, ".tiff")
tiff(fig_file, width = 5000, height = 6800, res = 820)

# Panel numbers in drawing order, one line per layout row (3 columns).
laymat <- matrix(
  c( 1,  2,  3,
     6,  7,  8,
    17, 11, 12,
     4, 15, 15,
     9, 15, 15,
    13, 15, 15,
     5, 15, 15,
    10, 15, 15,
    14, 16, 16),
  byrow = TRUE, ncol = 3
)

#layout(laymat,heights=c(2,5,2,2,5,2,4,6,2), widths=c(6,6,6))
layout(laymat, heights = c(2, 5, 2, 2, 5, 2, 2, 5, 7), widths = c(6, 6, 6))
#layout.show(n=16)

# "age;lineage" key of every retained feature
fact <- paste0(data_sig$age, ";", data_sig$lineage)
#data_sig=data_sig[fact !="3;SpA",]
#===================================================================
# Age effect
#==================================================================
# Number of features per age class and the labels used for the classes
eff1 <- table(data_sig$age)
class1 <- c("Gen", "N2", "N1", "Term")
cnames1 <- paste(class1, "\n n=", eff1, sep = "")

# Pairwise Wilcoxon tests between age classes =========================
# c1 / c2 give, for each pair, both the value of `age` to compare and the
# x position of the group in the panel (pairwise_test() receives them twice).
c1 <- c(0, 0, 0, 1, 1, 2)
c2 <- c(1, 2, 3, 2, 3, 3)
par(mar = c(0, 5, 0, 2))

# size
select_test <- data.frame(trait = log2(data_sig$size_aa), factor = data_sig$age)
pairwise_test(select_test, c1, c2, c1, c2) # plot comparison tests
title("A", cex.main = lettersize, adj = 0, line = -2, outer = FALSE)

# ISD
select_test <- data.frame(trait = data_sig$mean_dis, factor = data_sig$age)
pairwise_test(select_test, c1, c2, c1, c2)
title("B", cex.main = lettersize, adj = 0, line = -2, outer = FALSE)

# GC
# The column is read as `gc` everywhere else in this script (boxplot, heat
# map, PCA table); `$` is case-sensitive, so `GC` returned NULL here.
# NOTE(review): confirm the column name against the input table.
select_test <- data.frame(trait = data_sig$gc, factor = data_sig$age)
pairwise_test(select_test, c1, c2, c1, c2)
title("C", cex.main = lettersize, adj = 0, line = -2, outer = FALSE)

# snp rate
select_test <- data.frame(trait = data_sig$snp_l, factor = data_sig$age)
pairwise_test(select_test, c1, c2, c1, c2)
title("D", cex.main = lettersize, adj = 0, line = -2, outer = FALSE)

# dist gene: only age classes 1 to 3 are compared
c3 <- c(1, 1, 2)
c4 <- c(2, 3, 3)

select_test <- data.frame(trait = log2(data_sig$dist_gene + 1), factor = data_sig$age)
pairwise_test(select_test, c3, c4, c3, c4)
title("E", cex.main = lettersize, adj = 0, line = -2, outer = FALSE)
	
	# Boxplots of every property per age class
	#==================================================================
	par(mar = c(4, 5, 0, 1), cex.axis = cexi)

	# age as a factor; plot() on a factor / numeric pair draws boxplots
	age_all <- as.factor(as.character(data_sig$age))

	# size
	plot(age_all, log2(data_sig$size_aa), names = class1, ylab = "", cex = cexi)
	mtext("Size ", side = 2, line = 3, cex = cexj)
	mtext(expression(paste("(" ,log[2]," amino acids)",sep="")), side = 2, line = 2, cex = cexj)

	# ISD
	plot(age_all, data_sig$mean_dis, names = class1, ylab = "ISD", cex = cexi)

	# GC
	plot(age_all, data_sig$gc, names = class1, ylab = "GC %", cex = cexi)

	# snp rate
	plot(age_all, data_sig$snp_l, names = class1, ylab = "SNP rate", cex = cexi)

	# dist gene: restricted to features of type "orf", labelled with the
	# last three age classes
	is_orf <- data_sig$type == "orf"
	age_orf <- as.factor(as.character(data_sig$age[is_orf]))
	plot(age_orf, log2(data_sig$dist_gene[is_orf] + 1), names = class1[2:4], ylab = "", cex = cexi)
	mtext("Distance to closest", side = 2, line = 3, cex = cexj)
	mtext(expression(paste("gene (" ,log[2],")",sep="")), side = 2, line = 2, cex = cexj)
	
	##################################################################
	# heatmap with separated size ranges
	##################################################################

	# size classes are defined on log2(size_aa)
	data_sig$log2 <- log2(data_sig$size_aa)

	# split the table by size range (breaks every 0.6 log2 units from 4.3);
	# features falling outside the breaks are dropped by split()
	split_table <- split(data_sig, cut(data_sig$log2, seq(4.3, 13, by = 0.6)))
	range_names <- names(split_table)

	# tag every sub-table with its range index (rangei) and range label
	# (rangeval), then stack them in a single rbind() call instead of
	# growing the data frame inside a loop
	tagged_ranges <- lapply(seq_along(range_names), function(rangei) {
		subi <- split_table[[rangei]]
		subi$rangei <- rep(rangei, nrow(subi))
		subi$rangeval <- rep(range_names[rangei], nrow(subi))
		subi
	})
	data_range <- do.call(rbind, tagged_ranges)

	# check group sizes per range and feature type
	table(data_range$rangei, data_range$type)
	
#plot images ########################################################
# One heat map per trait: rows = age classes, columns = size ranges.
par(mar = c(2, 3, 0, 1))

# Lower bound of every size range, back-transformed from log2 to amino acids
rval1 <- matrix(unlist(strsplit(sub("(", "", range_names, fixed = TRUE), ",")),
                ncol = 2, byrow = TRUE)[, 1]
rval1 <- round(2^as.numeric(rval1))

# Draw the heat map of one trait and label its axes.
# image() places the cells of an n-wide axis at seq(0, 1, length.out = n),
# so the tick positions are derived from the dimensions of the matrix
# instead of being hard-coded for 4 age classes and 14 (or 5) size ranges.
# Returns the plotted matrix invisibly.
draw_trait_image <- function(selrange) {
  trait_mat <- make_tableimage(selrange)
  n_age <- nrow(trait_mat)
  n_range <- ncol(trait_mat)

  image(t(trait_mat), col = brewer.pal(n = 9, name = "BuGn"), axes = FALSE)
  mtext(class1[seq_len(n_age)], side = 2, outer = FALSE,
        at = seq(0, 1, length.out = n_age), cex = 0.5, las = 2)
  mtext(rval1[seq_len(n_range)], side = 1, outer = FALSE,
        at = seq(0, 1, length.out = n_range), cex = 0.5, las = 2)
  invisible(trait_mat)
}

# ISD
selrange <- data.frame(factor = data_range$age, trait = data_range$mean_dis,
                       rangei = data_range$rangei)
tplottrait <- draw_trait_image(selrange)

# GC
selrange <- data.frame(factor = data_range$age, trait = data_range$gc,
                       rangei = data_range$rangei)
tplottrait <- draw_trait_image(selrange)

# snp rate
# NOTE(review): this panel reads `dsnp_l` whereas the test and boxplot of
# the SNP rate read `snp_l` - confirm which column is intended.
selrange <- data.frame(factor = data_range$age, trait = data_range$dsnp_l,
                       rangei = data_range$rangei)
tplottrait <- draw_trait_image(selrange)

par(mar = c(11, 3, 0, 1))

# dist genes: restricted to features of type "orf"
orf_rows <- data_range$type == "orf"
selrange <- data.frame(factor = data_range$age[orf_rows],
                       trait = log2(data_range$dist_gene[orf_rows] + 1),
                       rangei = data_range$rangei[orf_rows])
tplottrait <- draw_trait_image(selrange)
	


	#1 PCA ==================================================================

# Prepare the PCA table, including the expression data
data_pca <- data.frame(
  type = data$type,
  SIZE = log2(data$size_aa),
  ISD = data$mean_dis,
  RPF = log2(data$RPF_start + 1),
  TOT = log2(data$TOT_start + 1),
  TE = data$TE_start,
  SNP = data$snp_l,
  GC = data$gc,
  cons = data$cons,
  age = data$age,
  name = data$feat_name
)

# Keep features flagged sig == "1" whose TE is a finite number.
# is.finite() rejects NA, NaN, Inf and -Inf in one step (the former string
# comparison with "Inf" let -Inf through), and which() keeps a missing `sig`
# from injecting all-NA rows that prcomp() cannot handle.
data_pca <- data_pca[which(data$sig == "1" & is.finite(data_pca$TE)), ]

#PCA with prcomp========================================================
# Columns 2 to 8 are the numeric traits: SIZE, ISD, RPF, TOT, TE, SNP, GC
pca <- prcomp(data_pca[, 2:8], center = TRUE, scale. = TRUE, retx = TRUE)
summary(pca)
pca$rotation

# standard deviation of every component
pca$sdev

# Percentage of variance carried by every principal component
var_all <- 100 * pca$sdev^2 / sum(pca$sdev^2)

# total variance carried by PC1 and PC2
sum(var_all[1:2])

# rounded percentages used in the axis labels of the biplot
pc1var <- round(var_all[1])
pc2var <- round(var_all[2])




#plot PCA========================================================

# Biplot with every feature
par(mar = c(4, 4, 4, 4), cex.axis = cexi)

# colours of the three age classes drawn below (age "1", "2", "3")
cola <- c("seagreen3", "slateblue3", "tomato")

# First pass: sets up the axes with fully transparent variable labels.
# NOTE(review): `ccoli` is not defined in the visible part of this script
# (only `ccol` is) - confirm that myBiplot.R provides it or that myBiplot()
# never evaluates `col` when type = "n".
myBiplot(pca, choices = 1:2, type = "n", pch = 20, col = ccoli,
  col.text = makeTransparent(1, 0), col.arrows = 1, ylim = c(-0.06, 0.05), xlim = c(-0.05, 0.05),
  xlab = paste("PC1 (", pc1var, "%)", sep = ""), ylab = paste("PC2 (", pc2var, "%)", sep = ""))

# Add the features on top of the axes, one group after the other.
# drop = FALSE keeps a two-column matrix even when a group contains a
# single feature; otherwise points() would receive a plain vector and plot
# the two scores against their index.
scores <- pca$x[, 1:2]
points(scores[data_pca$type == "gene", , drop = FALSE], type = "p", pch = 20, col = makeTransparent("grey30", 100))
points(scores[data_pca$type == "orf", , drop = FALSE], type = "p", pch = 20, col = makeTransparent(ccol[3], 250))
points(scores[data_pca$age == "1", , drop = FALSE], type = "p", pch = 20, col = cola[1])
points(scores[data_pca$age == "2", , drop = FALSE], type = "p", pch = 20, col = cola[2])
points(scores[data_pca$age == "3", , drop = FALSE], type = "p", pch = 20, col = cola[3])
title("F", cex.main = lettersize, adj = 0, line = 3, outer = FALSE)

# Second pass on the same panel: redraw the biplot with visible labels and
# arrows so that they sit above the points.
par(new = TRUE)
myBiplot(pca, choices = 1:2, type = "n", pch = 20, col = ccoli,
  col.text = "black", col.arrows = "grey10", ylim = c(-0.06, 0.05), xlim = c(-0.05, 0.05), cex = cexi, lwd = 1.5, xlab = "", ylab = "")
#legend("topright" ,c("Genes","ORF_N2", "ORF_N1","ORF_term","Sig_exp"), cex=0.9, fill=c(ccol,2), border="white")
legend("topright", c("Genes", "tORF_N2", "tORF_N1", "tORF_Term"), cex = cexi, fill = c("grey30", cola), border = "white")


# Plot the percentage of variance of every principal component
par(mar = c(7, 4, 2, 8), cex.axis = cexi)
barplot(var_all, xlab = "Percentage of variances", ylab = "",
  col = "white", las = 2, horiz = TRUE, names.arg = paste("PC", 1:length(var_all), sep = ""))
title("G", cex.main = lettersize, adj = 0, line = 0, outer = FALSE)


dev.off()

