# Script to detect initiation peaks and phasing from
# 5' RPF read densities obtained with plastid.

# Input / output tables
#---------------------------------------------------------------------
# Density table read at the start of the script (one row per ORF or gene
# and haplotype).
file_in <- "../../04plastid/03_density_analysis/table_density_all.txt"
# Destination of the final statistics table.
file_out <- "../tables_outrevised/03plastid/table_all_stat.txt"
#haplo="SA03"

# Read-number thresholds
#---------------------------------------------------------------------
# Minimum number of reads in the start window (count columns 50-53)
# required to call a peak; the "b" value is used for haplotype "SD06".
minread_peak <- 5
minread_peakb <- 5

# Minimum number of reads in the phasing window (count columns 54..poslim)
# required to run the phasing test; the "b" value is used for "SD06".
minread_phas <- 15
minread_phasb <- 15

# Geometry of the count profiles
#---------------------------------------------------------------------
# Number of columns of the count matrix built from each count string.
maxpos <- 101
# Last column included in the phasing window.
poslim <- 101

# Largest adjusted p-value still flagged as significant.
pval_max <- 0.05
# Not referenced by the rest of the visible script.
peakmin_order <- 1


# Prepare data
#=====================================================================

# Full density table; the first line of the file holds the column names.
data_all <- read.table(file_in, header = TRUE)

## Conservation table: only ORFs listed in its "orf" column are kept
## (drops ORFs that are no longer conserved after filtering, in case
## plastid was not re-run on the new dataset).
data_consorf <- read.table(
	"../../00scripts_ps24/tables_out/02conservation/conservation_table_spar.txt",
	header = TRUE
)
orf_names <- data_consorf$orf

# ORFs with known similarity in protein databases; they are excluded.
names_out <- c(
	"ORF_19174", "ORF_153326", "ORF_164328", "ORF_168700",
	"ORF_165864", "ORF_194483", "ORF_131221", "ORF_15103",
	"ORF_1810", "ORF_190389", "ORF_131209", "ORF_86880"
)

# "not in" operator: logical negation of %in%.
`%ni%` <- Negate(`%in%`)

# ORF rows: present in the conservation table, then minus the exclusion list.
data_orfsel1 <- data_all[data_all$name %in% orf_names, ]
data_orfsel <- data_orfsel1[data_orfsel1$name %ni% names_out, ]

# Rows annotated as genes are kept unconditionally.
data_genesel <- data_all[data_all$type == "gene", ]

# Working table: selected ORFs followed by genes.
data_all <- rbind(data_orfsel, data_genesel)



# Accumulator for the per-haplotype result tables (one rbind per haplotype).
data_bilan <- NULL
chaplo <- unique(data_all$haplo)

# seq_along() rather than 1:length(): with no haplotype the loop body is
# skipped instead of being run for the indices 1 and 0.
for (i in seq_along(chaplo)) {
	# Per haplotype
	#=====================================================================

	haplo <- chaplo[i]
	data <- data_all[data_all$haplo == haplo, ]

	# Matrix of aligned-read counts: one row per ORF/gene, one column per
	# position (X1..X<maxpos>), parsed from the "-"-separated count strings.
	# NOTE(review): the profile is read from data$count here but exported from
	# data$counts at the end of the loop; `$` partial matching makes both work
	# only if the input column is named "counts" -- confirm the column name.
	count_fields <- strsplit(as.character(data$count), "-")
	# Fail loudly instead of letting matrix() silently shift values across
	# rows when a profile does not hold exactly maxpos values.
	if (any(lengths(count_fields) != maxpos)) {
		stop("haplo ", as.character(haplo), ": every count string must hold ",
			maxpos, " '-'-separated values", call. = FALSE)
	}
	count <- data.frame(matrix(as.numeric(unlist(count_fields)),
		ncol = maxpos, byrow = TRUE))
	rownames(count) <- data$name

	# Read totals in the start window (columns 50-53) and in the phasing
	# window (columns 54..poslim).
	data$nbread_start <- apply(count[, 50:53], 1, sum)
	data$nbread_phas <- apply(count[, 54:poslim], 1, sum)

	# Minimum read numbers; SD06 gets its own thresholds because it has
	# fewer reads.
	if (haplo == "SD06") {
		minread_peaki <- minread_peakb
		minread_phasi <- minread_phasb
	} else {
		minread_peaki <- minread_peak
		minread_phasi <- minread_phas
	}

	# Peak detection at column 51
	#=====================================================================
	# Resulting classes in data$peak:
	#   0 = no peak
	#   1 = imprecise peak (X51 above X49 and X53)
	#   2 = precise peak   (X51 above X50 and X52)
	#   3 = precise peak that also reaches the profile maximum (X51 >= max1)
	# Every class additionally requires nbread_start >= minread_peaki.
	enough_start <- data$nbread_start >= minread_peaki

	# select0: imprecise peak
	select0 <- count$X51 > count$X53 & count$X51 > count$X49 & enough_start

	# select1: precise peak
	select1 <- count$X51 > count$X52 & count$X51 > count$X50 & enough_start

	# select2: highest peak of the profile
	select2 <- count$X51 >= data$max1 & enough_start

	# Later assignments overwrite earlier ones, so the highest class wins.
	data$peak <- rep(0, nrow(data))
	data$peak[select0] <- 1
	data$peak[select1] <- 2
	data$peak[select1 & select2] <- 3

	# Number of ORFs/genes per peak class and type. Values are not
	# auto-printed inside a for loop, hence the explicit print().
	print(table(data$peak, data$type))

	# Phasing test
	#=====================================================================
	# Column indices of the three phases (the start codon is excluded).
	p1 <- seq(54, poslim, by = 3)
	p2 <- seq(55, poslim, by = 3)
	p3 <- seq(56, poslim, by = 3)

	data$p1 <- apply(count[, p1], 1, sum)
	data$p2 <- apply(count[, p2], 1, sum)
	data$p3 <- apply(count[, p3], 1, sum)

	# One-sided binomial test: is the share of phasing-window reads falling
	# in phase 1 greater than 1/3? Only run with enough reads in the window
	# and at least one phase-1 read; pval stays NA otherwise.
	data$pval <- rep(NA_real_, nrow(data))
	for (k in seq_len(nrow(data))) {
		nbi <- data$nbread_phas[k]
		p1i <- data$p1[k]
		if (nbi >= minread_phasi && p1i > 0) {
			testi <- binom.test(p1i, nbi, 1/3, alternative = "greater")
			data$pval[k] <- testi$p.value
		}
	}

	# FDR correction within the haplotype; n is the number of rows that met
	# the test conditions.
	nb_test <- length(data$pval[data$nbread_phas >= minread_phasi & data$p1 > 0])
	data$padj <- p.adjust(data$pval, method = "fdr", n = nb_test)

	# Reduced table for this haplotype, appended to the global result.
	data_red <- data.frame(haplo = data$haplo, cons = data$cons, name = data$name,
		type = data$type, nbread_start = data$nbread_start,
		nbread_phas = data$nbread_phas,
		p1 = data$p1, p2 = data$p2, p3 = data$p3, peak = data$peak,
		pval = data$pval, padj = data$padj, counts = data$counts)

	data_bilan <- rbind(data_bilan, data_red)

}

# Add the significance column: sig = 1 when all of the following hold,
# sig = 0 otherwise.
#   - a peak was detected (peak != 0; peak calls already require the
#     minimum number of reads in the start window)
#   - an adjusted p-value exists (padj is NA when the phasing test was not
#     run, e.g. too few reads in the phasing window)
#   - padj <= pval_max
select_sig <- !is.na(data_bilan$padj) &
	data_bilan$padj <= pval_max &
	data_bilan$peak != 0

data_bilan$sig <- numeric(nrow(data_bilan))
data_bilan$sig[select_sig] <- 1

# Save the statistics table
#=====================================================================
# Tab-separated and unquoted; row names are written too (write.table default).
write.table(data_bilan, file = file_out, sep = "\t", quote = FALSE)
