#!/usr/bin/Rscript
# to assign real genes to bins of simulated genes
# max 8/20/14

# modified 10/22/2014 and 10/29/2014 to fit into simulation pipeline.

# number of simulated genes assigned to each real gene, for each binning parameter
# (presence and gains)
realbinsize = 1000
# thresholds used further down to filter real and simulated genes:
#   gain count must be >= gainthresh
#   presence count must be >= presthresh and <= nrow(pres) - presthresh
gainthresh = 2
presthresh = 2

# size of 'bin' files of computed simulated gene x gene counts (2 'bins' together make a
# bin file, to allow overlap of bins)
max_bin_size = 8000		# CHANGE IF YOU ARE CHANGING HOW YOU ARE HANDLING THE BIN FILES!!!!
				# note that the actual quantity varies among binfiles because of ties
				# should be okay as long as realbinsize is same size or smaller
				# see simsortandbin_*.R scripts for reference

# make sure destination dir and important variables are specified
# (checked before the file read so a missing variable fails immediately)
stopifnot(exists('working_dir'))
stopifnot(exists('pres'))
stopifnot(exists('gain'))

# per-simulated-gene summary table; row names identify the simulated genes
counts = read.table('processed_data/simko_summaries_022614.txt',header=TRUE)

# the table must have rows and the two count columns that the rest of the script reads
stopifnot(nrow(counts)>0)
stopifnot(all(c('countedgain','countedpres') %in% colnames(counts)))

specnum = nrow(pres)

# Pick the simulated genes that form the null 'bin' around one parameter value.
#
# param     : value around which the bin is built (e.g. prevalence or number of gains)
# thresh    : number of simulated genes wanted in the bin
# simparams : named vector holding the same parameter for every simulated gene
#
# Returns a character vector of exactly `thresh` names taken from `simparams`.
# Genes whose value equals `param` come first.  The remainder is split as evenly as
# possible between values below and above `param` (shifted to one side when the other
# side runs short) and filled by stepping outwards one unit at a time, lower side first.
# Within one step genes are taken in input order, so the result depends on the order of
# `simparams`.
# Stops with an error when `simparams` is unnamed or holds fewer than `thresh` usable
# genes.
calc_extras = function(param,thresh,simparams) {
	if (length(simparams) > 0 && is.null(names(simparams))) {
		stop('simparams must be a named vector',call.=FALSE)
	}

	# simulated genes that match the parameter exactly
	simgene_list = names(simparams[simparams==param])
	ID = length(simgene_list)

	if (ID >= thresh) {
		# enough exact matches: just take the first `thresh` of them
		# (thus the input order dependence!)
		return(simgene_list[seq_len(thresh)])
	}

	n_above = sum(simparams > param,na.rm=TRUE)
	n_below = sum(simparams < param,na.rm=TRUE)
	if (ID + n_above + n_below < thresh) {
		# the bin could never be filled; fail instead of expanding forever
		stop('not enough simulated genes to fill a bin of size ',thresh,call.=FALSE)
	}

	# want to get the same number of genes from each side of the given parameter
	uppergenes = ceiling((thresh - ID)/2)
	lowergenes = floor((thresh - ID)/2)

	# in case there aren't enough genes on the high end
	if (n_above < uppergenes) {
		reweight = uppergenes - n_above
		lowergenes = lowergenes + reweight
		uppergenes = uppergenes - reweight
	}

	# in case there aren't enough genes on the low end
	if (n_below < lowergenes) {
		reweight = lowergenes - n_below
		uppergenes = uppergenes + reweight
		lowergenes = lowergenes - reweight
	}

	# widen the window by one unit per pass until both sides are filled
	diff = 0
	while (uppergenes > 0 || lowergenes > 0) {
		diff = diff+1

		if (lowergenes > 0) {
			# genes in [param-diff, param-diff+1): not picked in an earlier pass
			lowerbound = param-diff
			lower = names(simparams[(simparams >= lowerbound) & (simparams < lowerbound+1)])
			taken = head(lower,lowergenes)
			simgene_list = append(simgene_list,taken)
			lowergenes = lowergenes - length(taken)
		}

		if (uppergenes > 0) {
			# genes in (param+diff-1, param+diff]: not picked in an earlier pass
			upperbound = param+diff
			upper = names(simparams[(simparams <= upperbound) & (simparams > upperbound-1)])
			taken = head(upper,uppergenes)
			simgene_list = append(simgene_list,taken)
			uppergenes = uppergenes - length(taken)
		}
	}

	simgene_list
}

print('preprocessing')

# per-simulated-gene gain and presence counts, labelled with the simulated gene names
simgains = counts[,'countedgain']
simpres = counts[,'countedpres']
names(simgains) = rownames(counts)
names(simpres) = rownames(counts)

# simulated genes with enough gains
simgained = names(simgains[simgains>=gainthresh])

# FIXES the filtering --> binning bug!  should remove the stuff we are not interested in:
# keep simulated genes whose presence count lies in [presthresh, specnum - presthresh]
simpres = simpres[simpres>=presthresh]
simpres = simpres[simpres<=(specnum-presthresh)]
simpresed = names(simpres)

# simulated genes passing both the presence and the gain filter
both = simpresed[(simpresed %in% simgained)]

# now to handle real genes: same two filters, applied to the column totals
pres_total = colSums(pres)
gain_total = colSums(gain)
prevalent = names(pres_total[(pres_total >= presthresh) & (pres_total <= nrow(pres)-presthresh)])
gained = names(gain_total[gain_total >= gainthresh])

threshed_kos = na.omit(gained[(gained %in% prevalent)])

# restrict to the retained genes; drop=FALSE keeps two dimensions even when a single
# gene survives, so colSums() below still works
pres = pres[,threshed_kos,drop=FALSE]
gain = gain[,threshed_kos,drop=FALSE]

pres_total = colSums(pres)
gain_total = colSums(gain)

print('assigning null distributions for each gene')

# one row per real gene, realbinsize simulated gene names per row
nullpres_per_gene = matrix(rep('',length(threshed_kos)*realbinsize),length(threshed_kos))
nullgain_per_gene = matrix(rep('',length(threshed_kos)*realbinsize),length(threshed_kos))
rownames(nullpres_per_gene) = threshed_kos
rownames(nullgain_per_gene) = threshed_kos

# parameter vectors for the retained simulated genes; these do not depend on the real
# gene, so they are built once here rather than on every pass of the loop.
# names are set explicitly because single-column extraction does not carry row names
sim_both = counts[both,]
sim_both_names = rownames(sim_both)
both_pres = sim_both[,'countedpres']
names(both_pres) = sim_both_names
both_gain = sim_both[,'countedgain']
names(both_gain) = sim_both_names

# modified 7/1/2014 to include ceiling function for purposes of null assignment
for (gene in colnames(pres)) {
	nullpres_per_gene[gene,] = calc_extras(ceiling(pres_total[gene]),realbinsize,both_pres)
	nullgain_per_gene[gene,] = calc_extras(ceiling(gain_total[gene]),realbinsize,both_gain)
}

# now actually assign pres/gain values to binfiles

# Group real genes, in order of increasing total, into numbered bins and record which
# simulated genes each bin needs.
#
# totals        : named vector of per-real-gene totals (presence or gain counts)
# null_per_gene : character matrix, one row per real gene (row names = gene names),
#                 holding the simulated genes assigned to that gene
# sim_names     : names of all simulated genes under consideration
# max_size      : a bin is closed once adding the next value group would bring it to
#                 this many simulated genes or more
#
# Returns a list with
#   sim  : 0/1 matrix, rows = sim_names, one column per bin (columns named 1, 2, ...)
#   real : named vector mapping every real gene to its bin number
#
# Real genes sharing a total are kept together, and the null row of the first gene of
# the group stands in for the whole group.  When a bin is closed, its column also marks
# the null genes of the group that did not fit, so consecutive bins overlap.
assign_value_bins = function(totals,null_per_gene,sim_names,max_size) {
	values = sort(unique(totals))

	# 0/1 membership column over all simulated genes
	membership = function(members) {
		binned = rep(0,length(sim_names))
		names(binned) = sim_names
		binned[members] = 1
		binned
	}

	sim_binning = NULL
	real_binning = numeric(0)
	bin_sims = c()		# simulated genes collected for the open bin
	bin_reals = c()		# real genes collected for the open bin
	bin = 1
	i = 1

	while (i <= length(values)) {
		group = names(totals)[totals == values[i]]
		candidate = union(bin_sims,null_per_gene[group[1],])

		if (length(bin_reals) > 0 && length(candidate) >= max_size) {
			# the open bin is full: close it (including the overlap with the
			# group that did not fit) and retry the same group on a fresh bin.
			# an empty bin always accepts the group, so the loop cannot stall
			sim_binning = cbind(sim_binning,membership(candidate))
			real_binning[bin_reals] = bin
			bin_sims = c()
			bin_reals = c()
			bin = bin+1
		} else {
			bin_sims = candidate
			bin_reals = append(bin_reals,group)
			i = i+1
		}
	}

	# need to catch that last bin!  done after the loop so that the final value
	# group is already part of it
	if (length(bin_reals) > 0) {
		sim_binning = cbind(sim_binning,membership(bin_sims))
		real_binning[bin_reals] = bin
	}

	if (is.null(sim_binning)) {
		# no real genes at all: empty result with the expected row layout
		sim_binning = matrix(0,nrow=length(sim_names),ncol=0,dimnames=list(sim_names,NULL))
	} else {
		colnames(sim_binning) = seq_len(ncol(sim_binning))
	}

	list(sim=sim_binning,real=real_binning)
}

pres_values = sort(unique(pres_total))
gain_values = sort(unique(gain_total))

# presence 'binned' to files
pres_bins = assign_value_bins(pres_total,nullpres_per_gene,both,max_bin_size)
sim_presgene_binning = pres_bins$sim
real_presgene_binning = pres_bins$real

# check that nothing is too crazy
stopifnot(length(real_presgene_binning) == length(pres_total))
stopifnot(length(unique(c(nullpres_per_gene))) == nrow(sim_presgene_binning))

# now for gains 'binned' to files
gain_bins = assign_value_bins(gain_total,nullgain_per_gene,both,max_bin_size)
sim_gaingene_binning = gain_bins$sim
real_gaingene_binning = gain_bins$real

# NOTE(review): the equivalent sanity checks for gains were disabled in this script;
# they are left disabled here - confirm whether they can be switched back on
#stopifnot(length(real_gaingene_binning) == length(gain_total))
#stopifnot(length(unique(c(nullgain_per_gene))) == nrow(sim_gaingene_binning))
	
# table of real genes: name, presence total, gain total and the two bin numbers
real_genes = cbind(threshed_kos,pres_total[threshed_kos],gain_total[threshed_kos],real_presgene_binning[threshed_kos],real_gaingene_binning[threshed_kos])
colnames(real_genes) = c('Gene','prevalence','gain_num','Pres_bin','Gain_bin')

print('writing data')
write.table(real_genes,file.path(working_dir,'real_gene_bins_1M.txt'),row.names=FALSE,quote=FALSE)
remove(real_genes)

# table of retained simulated genes: name, presence count and gain count
sim_genes = cbind(both,counts[both,'countedpres'],counts[both,'countedgain'])
colnames(sim_genes) = c('Gene','prevalence','gain_num')

print('writing sims')
# drop=FALSE keeps the bin column (and its name) even when there is only one bin
write.table(sim_presgene_binning[both,,drop=FALSE],file.path(working_dir,'sim_presbinning.txt'),col.names=TRUE,row.names=TRUE,quote=FALSE)
write.table(sim_gaingene_binning[both,,drop=FALSE],file.path(working_dir,'sim_gainbinning.txt'),col.names=TRUE,row.names=TRUE,quote=FALSE)

remove(sim_presgene_binning)
remove(sim_gaingene_binning)

write.table(sim_genes,file.path(working_dir,'sim_gene_bins_1M.txt'))
# one line per real gene: gene name followed by its simulated null genes
write.table(nullgain_per_gene,file.path(working_dir,'nullgenes_for_realgenes_gain_1M.txt'),quote=FALSE,col.names=FALSE)
write.table(nullpres_per_gene,file.path(working_dir,'nullgenes_for_realgenes_presence_1M.txt'),quote=FALSE,col.names=FALSE)

# NOTE(review): nullgain_per_gene is not removed here, unlike nullpres_per_gene -
# confirm whether a later pipeline step still relies on it
remove(sim_genes)
remove(nullpres_per_gene)

