#!/usr/bin/Rscript

# function to calculate null distribution intervals for various
# bags of simulated genes

# These summaries come from the initial "assign_genes_to_simbins.R" script; the file
# read here is equivalent to the "sim_gene_bins_1M.txt" file written by that script.
# It keeps its original name to indicate that it is the original output, provided
# here in processed form.
# Load the simulated-gene summary table; the first line of the file holds the
# column names. TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned, whereas TRUE is a reserved word.
counts = read.table('processed_data/reproduced_analysis/sim_gene_bins_1M.txt',header=TRUE)

# Columns analysed below, extracted as plain vectors.
gaincounts = counts[,'gain_num']
prescounts = counts[,'prevalence']

# For every integer gain / prevalence value, find how far a window around that
# value has to be widened before it holds at least `thresh` simulated genes,
# then draw those radii on top of histograms of the simulated counts.
#
# The default threshold assumes that i want 1000 genes for each param in null,
# but that i am only sampling 1/10 of eventual null distribution for now.
#
# Args:
#   gaincounts: numeric vector of gain counts (the 'gain_num' column at the
#               call site); must be non-empty and finite.
#   prescounts: numeric vector of prevalence counts (the 'prevalence' column
#               at the call site); must be non-empty and finite.
#   outname:    label pasted into the plot titles.
#   thresh:     minimum number of values a window must contain (default 1000).
#
# Returns (invisibly) a list with two named numeric vectors, `prevalence` and
# `gain`. Each is indexed by the bin value ("1", "2", ...) and holds the radius
# for that bin; 0 means the bin alone already reaches `thresh`.
#
# Side effects: prints progress/summary information and draws two plots on
# the current graphics device. The device margins are restored on exit.
calc_interval_makeplot = function(gaincounts,prescounts,outname,thresh=1000) {

  stopifnot(is.numeric(thresh), length(thresh) == 1, !is.na(thresh))

  # Radius around each bin centre needed to capture `thresh` values.
  # The window for radius r is the open interval (centre - r, centre + r), so
  # with integer counts and increment 1 a radius of 1 still covers only the
  # centre itself.
  calc_extras = function(counts,thresh,increment) {
    stopifnot(is.numeric(counts), length(counts) > 0, all(is.finite(counts)))
    stopifnot(is.numeric(increment), length(increment) == 1, increment > 0)
    print(length(counts))

    # A window can never hold more values than exist; without this cap the
    # widening loop below would never terminate for small inputs.
    target = thresh
    if (length(counts) < thresh) {
      warning('only ', length(counts), ' values available, fewer than thresh = ',
              thresh, '; windows are widened until they hold all values',
              call. = FALSE)
      target = length(counts)
    }

    # Same centres as iterating over max(1:max(counts)): 1 up to the largest
    # whole count, and always at least the single centre 1.
    centers = seq(1, max(1, floor(max(counts))), by=increment)
    radii = numeric(length(centers))
    names(radii) = as.character(centers)

    for (k in seq_along(centers)) {
      center = centers[k]
      radius = 0
      in_window = sum(counts == center)
      while (in_window < target) {
        radius = radius + increment
        # Recount the whole window each time. Adding this to a running total
        # would count the inner values again on every widening and stop the
        # search long before the window really holds `target` values.
        in_window = sum(counts > (center - radius) & counts < (center + radius))
      }
      radii[k] = radius
    }
    radii
  }

  print('gaining')
  gainer = calc_extras(gaincounts,thresh,1)
  #write.table(gainer,paste(outname,'gainbinradius_022214.txt',sep='.'))

  print('presing')
  preser = calc_extras(prescounts,thresh,1)
  #write.table(preser,paste(outname,'presbinradius_022214.txt',sep='.'))

  # Sanity check: the x positions used for plotting and the radii must have
  # the same length.
  print(length(seq(1,max(prescounts),by=1)))
  print(length(preser))

  print(summary(preser))
  print(summary(gainer))

  # Widen the right margin for the secondary axis; put the old margins back
  # when the function exits so later plots are unaffected.
  old_par = par(mar=c(5, 5, 4, 5) + 0.1)
  on.exit(par(old_par), add=TRUE)

  # Prevalence: histogram on the left axis, radius (0-80) on the right axis.
  # The radius is rescaled by 3000/80 so that it lines up with the right axis.
  #pdf(paste(outname,'pres_bin_cdf_073014.pdf',sep='.'))
  hist(prescounts,breaks=100,xlab='# pres',ylab= 'Frequency of genes',ylim=c(0,3000),main=paste(outname,'prevalence'))
  axis(4,at=seq(0,3000,by=(3000/16)),labels=seq(0,80,by=5),col="blue",lwd=2)
  lines(seq(1,max(prescounts),by=1),preser*3000/80,lwd=2,col='blue')
  mtext("Extra radius around point to fill null distribution",side=4,line=2)
  #dev.off()

  # Gains: histogram on the left axis, radius (0-50) on the right axis.
  # 10000/50 = 200 histogram units per radius unit.
  #pdf(paste(outname,'gain_bin_cdf_073014.pdf',sep='.'))
  hist(gaincounts,breaks=45,xlab='# gains',ylab= 'Frequency of genes',ylim=c(0,10000),main=paste(outname,'gains'))
  axis(4,at=seq(0,10000,by=10000/10),labels=seq(0,50,by=5),col="red",lwd=2)
  lines(seq(1,max(gaincounts),by=1),gainer*200,lwd=2,col='red')
  mtext("Extra radius around point to fill null distribution",side=4,line=2)
  #dev.off()

  invisible(list(prevalence=preser, gain=gainer))
}

# Run the analysis on the two columns loaded above; 'used_null' is the label
# that appears in the plot titles.
calc_interval_makeplot(gaincounts,prescounts,'used_null')
