#script to select recent expression increases based on RPF data
#Here we don't take into account ORF presence/absence:
#an increase can be due to an ORF gain or to an expression increase

# Significance thresholds used by the selections further down.
pval <- 0.05        # maximum adjusted p-value
logfold <- 2        # minimum absolute log2 fold change
minread_start <- 8  # minimum `nbread_start` for an ORF to count as translated

# Pairwise RPF statistics obtained with DESeq2 (one row per feature and pair).
stat_rpf <- read.table(
  "../tables_out/04counts/Deseq2/1.RPF_stat_deseq2.txt",
  header = TRUE
)

# Tag each feature as "gene" or "ORF" from its name.
# Names matching neither pattern keep type NA.
stat_rpf$type <- rep(NA_character_, nrow(stat_rpf))
stat_rpf$type[grep("gene_id=", stat_rpf$name)] <- "gene"
stat_rpf$type[grep("ORF", stat_rpf$name)] <- "ORF"

# Sanity check on the number of features per type
# (presumably divided by 6 because each feature appears once per pair).
table(stat_rpf$type) / 6

# Keep the ORF statistics only.
# `%in%` never returns NA, so features with an NA type are dropped instead of
# being turned into all-NA rows (which is what `type == "ORF"` would do).
data_orf <- stat_rpf[stat_rpf$type %in% "ORF", ]
data_orf$pairs <- paste0(data_orf$haplo1, "-", data_orf$haplo2)


# Raw (not normalised) read counts. They are attached to the statistics later
# so that comparisons resting on very few reads (which tend to inflate the
# fold change) can be discarded.
datacount <- read.table("../tables_out/04counts/Deseq2/1.counts_all_RPF_TOT_notnorm.txt")
select_rpf <- c(1, 2, 5, 6, 9, 10, 13, 14)  # presumably the RPF libraries
data_rpf <- datacount[, select_rpf]

# One stacked table with, for every feature and every pair, the two replicate
# counts of the first member (count1_*) and of the second member (count2_*).
# Each spec is: pair label, then the four count columns in output order.
count_pairs <- do.call(rbind, lapply(
  list(
    c("Cer-AA", "RPF_Cer1", "RPF_Cer3", "RPF_AA1", "RPF_AA3"),
    c("Cer-BB", "RPF_Cer1", "RPF_Cer3", "RPF_BB1", "RPF_BB3"),
    c("Cer-CC", "RPF_Cer1", "RPF_Cer3", "RPF_CC1", "RPF_CC2"),
    c("AA-BB", "RPF_AA1", "RPF_AA3", "RPF_BB1", "RPF_BB3"),
    c("AA-CC", "RPF_AA1", "RPF_AA3", "RPF_CC1", "RPF_CC2"),
    c("BB-CC", "RPF_BB1", "RPF_BB3", "RPF_CC1", "RPF_CC2")
  ),
  function(spec) {
    data.frame(
      id = rownames(data_rpf),
      pair = rep(spec[1], nrow(data_rpf)),
      count1_rep1 = data_rpf[[spec[2]]],
      count1_rep2 = data_rpf[[spec[3]]],
      count2_rep1 = data_rpf[[spec[4]]],
      count2_rep2 = data_rpf[[spec[5]]]
    )
  }
))

# Join key "<feature id>:<pair>".
count_pairs$id2 <- paste0(count_pairs$id, ":", count_pairs$pair)

# Attach the raw counts to every ORF x pair statistic.
# The join key is "<feature name>:<pair>", built the same way as count_pairs$id2.
data_orf$id2 <- paste0(data_orf$name, ":", data_orf$pairs)
# merge() has no `by.xy` argument: it was silently absorbed by `...` and the
# join only worked through the default `by`. Name the key explicitly.
data_orf <- merge(data_orf, count_pairs, by = "id2", all.x = TRUE, all.y = FALSE)

# ORF name = the part of the feature name before the first ";".
# sub() works row by row, so a name with an unexpected number of ";" fields
# (or an NA) cannot shift the names of the other rows.
data_orf$orf_name <- sub(";.*$", "", as.character(data_orf$name))

# Significance flag (1/0): adjusted p-value and fold change pass the thresholds
# and at least one of the four replicate counts reaches 10 reads.
count_cols <- c("count1_rep1", "count1_rep2", "count2_rep1", "count2_rep2")
max_count <- apply(data_orf[, count_cols], 1, max)
data_orf$sig <- rep(0, nrow(data_orf))
select_sig <- data_orf$padj <= pval &
  abs(data_orf$log2FoldChange) >= logfold &
  max_count >= 10

# which() skips rows whose test is NA (e.g. NA padj); they keep sig = 0.
data_orf$sig[which(select_sig)] <- 1


# Keep only the ORFs listed in the `orf` column of the conservation table,
# i.e. those that are still conserved after the new filtering.
datacons <- read.table(
  "../tables_out/02conservation/conservation_table_spar.txt",
  header = TRUE
)
selorfs <- datacons$orf
data_orf <- data_orf[is.element(data_orf$orf_name, selorfs), ]

# Save the per-ORF, per-pair statistics (tab separated, with a header line).
write.table(
  data_orf,
  file = "allORF_RPF_pairstat.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)

# Distribution of the log2 fold changes of the retained ORFs.
hist(data_orf$log2FoldChange)


# Build a summary ("bilan") table with one row per ORF.
# For each pair i (index into cpairs) it gets the columns
#   pair_i, logfold_i, padj_i, sig_i,
#   count1_1_i, count1_2_i, count2_1_i, count2_2_i
# so the final table has 1 + 8 * 6 columns.
data_bilan <- data.frame(orf_name = unique(data_orf$orf_name))
cpairs <- c("Cer-AA", "Cer-BB", "Cer-CC", "AA-BB", "AA-CC", "BB-CC")

# Columns of data_orf carried over for every pair (selected by name rather
# than by position), and the matching prefixes of the output column names.
bilan_cols <- c(
  "orf_name", "pairs", "log2FoldChange", "padj", "sig",
  "count1_rep1", "count1_rep2", "count2_rep1", "count2_rep2"
)
bilan_prefixes <- c(
  "pair_", "logfold_", "padj_", "sig_",
  "count1_1_", "count1_2_", "count2_1_", "count2_2_"
)

for (i in seq_along(cpairs)) {
  datai <- data_orf[data_orf$pairs == cpairs[i], ]
  datai_red <- datai[, bilan_cols]
  colnames(datai_red) <- c("orf_name", paste0(bilan_prefixes, i))
  # merge() has no `by.xy` argument (it was swallowed by `...`); use `by`.
  # all.x = TRUE keeps ORFs that have no row for this pair (columns set to NA).
  data_bilan <- merge(data_bilan, datai_red, by = "orf_name", all.x = TRUE)
}

# Split the ORFs according to whether any pairwise comparison is significant.
# Columns 5, 13, 21, 29, 37 and 45 of data_bilan are sig_1 ... sig_6.
tsig <- data_bilan[, seq(5, 45, by = 8)]
rownames(tsig) <- data_bilan$orf_name

# Number of significant comparisons per ORF.
nsig <- apply(tsig, 1, sum)
table(nsig == 0)  # original run: 289 lines with at least one significant difference

# ORFs without any significant difference are labelled "NS" in both columns.
data_notsig <- data_bilan[nsig == 0, ]
sel_nc <- data.frame(
  orf_name = data_notsig[, 1],
  select = rep("NS", nrow(data_notsig)),
  exp = rep("NS", nrow(data_notsig))
)

# ORFs with at least one significant difference go on to the lineage selection.
data_sig <- data_bilan[nsig != 0, ]


#=======================================================================
# select lineage specific expression increase ==========================
#=======================================================================
# Add ages (conservation information) to the ORFs with a significant difference.
# header = TRUE is stated explicitly: the merge below keys on the `orf` column,
# and the same file is read with a header earlier in the script.
data_cons <- read.table(
  "../tables_out/02conservation/conservation_table_spar.txt",
  header = TRUE
)
consred <- data_cons[, c(1, 4, 6:12)]
data_sig <- merge(
  data_sig, consred,
  by.x = "orf_name", by.y = "orf",
  all.x = TRUE, all.y = FALSE
)

# Translation information, used to check that an ORF is significantly
# translated in the lineage where its expression increased.
# The columns are accessed by name below, so the header line must be read.
datat <- read.table("../tables_out/03plastid/table_all_stat.txt", header = TRUE)

# ORF names with a translation signature, per haplotype
# (SD01 -> A, SD06 -> B, SA03 -> C).
# which() drops rows whose condition is NA, so no NA name enters the sets.
t_A <- datat$name[which(
  datat$haplo == "SD01" & datat$sig == 1 & datat$type == "orf" &
    datat$nbread_start >= minread_start
)]
t_B <- datat$name[which(
  datat$haplo == "SD06" & datat$sig == 1 & datat$type == "orf" &
    datat$nbread_start >= minread_start
)]
t_C <- datat$name[which(
  datat$haplo == "SA03" & datat$sig == 1 & datat$type == "orf" &
    datat$nbread_start >= minread_start
)]


# Lineage-specific increases. Pair numbers follow cpairs:
#   1 = Cer-AA, 2 = Cer-BB, 3 = Cer-CC, 4 = AA-BB, 5 = AA-CC, 6 = BB-CC.
# The selections treat a negative logfold as higher expression in the second
# member of the pair. Each group requires: all listed pairs significant, the
# expected sign in each of them, a non-zero value in the lineage column(s)
# A / B / C, and a translation signature in the lineage.

# C specific increase : pairs 3-5-6
select_C <- (data_sig$sig_3 == 1 & data_sig$sig_5 == 1 & data_sig$sig_6 == 1) &
  (data_sig$logfold_3 < 0 & data_sig$logfold_5 < 0 & data_sig$logfold_6 < 0) &
  data_sig$C != 0
data_C <- data_sig[select_C, ]
data_C <- data_C[data_C$orf_name %in% t_C, ]
data_C$select <- rep("C", nrow(data_C))
data_C$exp <- rep("Inc", nrow(data_C))

# B specific increase : pairs 2-4-6 (B is the first member of pair 6)
select_B <- (data_sig$sig_2 == 1 & data_sig$sig_4 == 1 & data_sig$sig_6 == 1) &
  (data_sig$logfold_2 < 0 & data_sig$logfold_4 < 0 & data_sig$logfold_6 > 0) &
  data_sig$B != 0
data_B <- data_sig[select_B, ]
data_B <- data_B[data_B$orf_name %in% t_B, ]
data_B$select <- rep("B", nrow(data_B))
data_B$exp <- rep("Inc", nrow(data_B))

# A specific increase : pairs 1-4-5 (A is the first member of pairs 4 and 5)
select_A <- (data_sig$sig_1 == 1 & data_sig$sig_4 == 1 & data_sig$sig_5 == 1) &
  (data_sig$logfold_1 < 0 & data_sig$logfold_4 > 0 & data_sig$logfold_5 > 0) &
  data_sig$A != 0
data_A <- data_sig[select_A, ]
data_A <- data_A[data_A$orf_name %in% t_A, ]
data_A$select <- rep("A", nrow(data_A))
data_A$exp <- rep("Inc", nrow(data_A))

# BC specific increase : pairs 2-3-4-5
# (translation signature required in B or in C)
select_BC <- (data_sig$sig_2 == 1 & data_sig$sig_3 == 1 &
  data_sig$sig_4 == 1 & data_sig$sig_5 == 1) &
  (data_sig$logfold_2 < 0 & data_sig$logfold_3 < 0 &
    data_sig$logfold_4 < 0 & data_sig$logfold_5 < 0) &
  (data_sig$C != 0 & data_sig$B != 0)
data_BC <- data_sig[select_BC, ]
data_BC <- data_BC[data_BC$orf_name %in% t_C | data_BC$orf_name %in% t_B, ]
data_BC$select <- rep("BC", nrow(data_BC))
data_BC$exp <- rep("Inc", nrow(data_BC))

# Stack the four groups, keeping columns 1, 58 and 59
# (orf_name and the select / exp labels appended above).
select_ORF_inc <- do.call(rbind, lapply(
  list(data_C, data_B, data_A, data_BC),
  function(group) group[, c(1, 58, 59)]
))

table(select_ORF_inc$select)

#write.table(select_ORF_inc,"select_ORF_022018", sep="\t", quote=F,col.names=F, row.names=F )

#=======================================================================
# select lineage specific expression DECREASE ==========================
#=======================================================================
# Same pairs as for the increases, with every sign reversed. No lineage-column
# or translation filter is applied here. The select_* / data_* objects of the
# increase section are overwritten.

# C specific decrease : pairs 3-5-6
select_C <- (data_sig$sig_3 == 1 & data_sig$sig_5 == 1 & data_sig$sig_6 == 1) &
  (data_sig$logfold_3 > 0 & data_sig$logfold_5 > 0 & data_sig$logfold_6 > 0)
data_C <- data_sig[select_C, ]
data_C$select <- rep("C", nrow(data_C))
data_C$exp <- rep("Dec", nrow(data_C))

# B specific decrease : pairs 2-4-6 (B is the first member of pair 6)
select_B <- (data_sig$sig_2 == 1 & data_sig$sig_4 == 1 & data_sig$sig_6 == 1) &
  (data_sig$logfold_2 > 0 & data_sig$logfold_4 > 0 & data_sig$logfold_6 < 0)
data_B <- data_sig[select_B, ]
data_B$select <- rep("B", nrow(data_B))
data_B$exp <- rep("Dec", nrow(data_B))

# A specific decrease : pairs 1-4-5 (A is the first member of pairs 4 and 5)
select_A <- (data_sig$sig_1 == 1 & data_sig$sig_4 == 1 & data_sig$sig_5 == 1) &
  (data_sig$logfold_1 > 0 & data_sig$logfold_4 < 0 & data_sig$logfold_5 < 0)
data_A <- data_sig[select_A, ]
data_A$select <- rep("A", nrow(data_A))
data_A$exp <- rep("Dec", nrow(data_A))

# BC specific decrease : pairs 2-3-4-5
select_BC <- (data_sig$sig_2 == 1 & data_sig$sig_3 == 1 &
  data_sig$sig_4 == 1 & data_sig$sig_5 == 1) &
  (data_sig$logfold_2 > 0 & data_sig$logfold_3 > 0 &
    data_sig$logfold_4 > 0 & data_sig$logfold_5 > 0)
data_BC <- data_sig[select_BC, ]
data_BC$select <- rep("BC", nrow(data_BC))
data_BC$exp <- rep("Dec", nrow(data_BC))

# Stack the four groups, keeping columns 1, 58 and 59
# (orf_name and the select / exp labels appended above).
select_ORF_dec <- do.call(rbind, lapply(
  list(data_C, data_B, data_A, data_BC),
  function(group) group[, c(1, 58, 59)]
))
#=========================================================================

#=======================================================================
# select Spar specific expression increase =============================
#=======================================================================

# Spar specific increase : pairs 1-2-3 (Cer against A, B and C).
# All three comparisons must be significant with a negative logfold, and the
# A, B and C columns must all be non-zero.
select_spar <- (data_sig$sig_1 == 1 & data_sig$sig_2 == 1 & data_sig$sig_3 == 1) &
  (data_sig$logfold_1 < 0 & data_sig$logfold_2 < 0 & data_sig$logfold_3 < 0) &
  (data_sig$C != 0 & data_sig$B != 0 & data_sig$A != 0)

data_spar <- data_sig[select_spar, ]
data_spar$select <- rep("ABC", nrow(data_spar))
data_spar$exp <- rep("Inc", nrow(data_spar))
# Translation filter intentionally left disabled:
#data_spar=data_spar[data_spar$orf_name %in% t_C & data_spar$orf_name %in% t_B &
#	data_spar$orf_name %in% t_A ,]

# Columns 1, 58 and 59 are orf_name, select and exp.
select_ORF_spar <- data_spar[, c(1, 58, 59)]

# Final table: increases, decreases, Spar-wide increases and non-significant ORFs.
bilan_all <- rbind(select_ORF_inc, select_ORF_dec, select_ORF_spar, sel_nc)

# Written without header and without row names, tab separated.
write.table(
  bilan_all,
  file = "80518_expressiongroup_ORF.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)



