#!/usr/bin/Rscript
# to apply a null distribution to observations of HGT between genes
# max 11/14/13
# fiddled with 11/3/2014 and 11/5/2014 to integrate into overall pipeline 
# and generally clean up
# Command-line handling.
# At least four trailing arguments are required because the values used below
# are read from positions 2, 3 and 4; position 1 is never read in this script.
# NOTE(review): position 1 is presumably a placeholder supplied by the caller
# (e.g. a literal "--args") - confirm against how the pipeline invokes this.
cli_args <- commandArgs(trailingOnly = TRUE)
if (length(cli_args) < 4) {
	print('error- script requires command-line arguments!')
	print('specifically, four trailing arguments are expected: argument 1 is not read by this script, followed by (2) the numeric slice of the data (bin) under analysis, (3) the directory to look in for the null distributions ("full_bin*","nullgenes*",etc.), and (4) the directory to look in for the counts of actual genes ("Cijmat.Rdat").')
	stop('expected at least 4 trailing command-line arguments, got ', length(cli_args), call. = FALSE)
	}

# presence/prevalence bin handled by this job (kept as the string given on the command line)
presbin <- cli_args[2]
# directory containing the null-distribution and gene-bin text files
working_dir <- cli_args[3]
# directory containing 'Cijmat.Rdat'
data_dir <- cli_args[4]

# Prefix of the per-bin null-distribution files; the full name is built later
# as '<prefix>.<presbin>.<gainbin>.txt'.
binfilename <- 'full_bin'
# Progress output is deliberately verbose: it makes individual jobs easy to track.
print('bin')
print(presbin)
realbinsize <- 1000

# Loading Cijmat.Rdat is expected to bring C_ij into the workspace (used below).
load(file.path(data_dir, 'Cijmat.Rdat'))
# Alternative input, kept for reference:
#C_ij = read.table(gzfile('processed_data/koko_gainpres_070114.txt.gz'),header=T)

print('reading')
presnull <- read.table(file.path(working_dir, 'nullgenes_for_realgenes_presence_1M.txt'))
gainnull <- read.table(file.path(working_dir, 'nullgenes_for_realgenes_gain_1M.txt'))
gene_bins <- read.table(file.path(working_dir, 'real_gene_bins_1M.txt'), header = TRUE)
# Columns 2 and 3 are rounded UP - the conservative choice.
gene_bins[2:3] <- lapply(gene_bins[2:3], ceiling)

# The observed counts, in contrast, are rounded DOWN.
kobyko <- floor(as.matrix(C_ij))	#[gene_bins$Gene,gene_bins$Gene]
presbins <- unique(gene_bins[, 'Pres_bin'])
gainbins <- unique(gene_bins[, 'Gain_bin'])

# Square p-value matrix, initialised to 1 everywhere (upper tail only for now),
# sized by the number of rows in presnull and labelled on both axes by gene.
pvals <- matrix(1, nrow(presnull), nrow(presnull))
dimnames(pvals) <- list(gene_bins$Gene, gene_bins$Gene)

# Count the entries of a null distribution that are at least as large as an
# observed value (the numerator of an upper-tail empirical p-value).
#   val     - observed value to compare against
#   nullmat - null values (vector or matrix)
# Returns the integer count; comparisons that evaluate to NA are not counted.
calc_p <- function(val, nullmat) {
	sum(nullmat >= val, na.rm = TRUE)
	}

# Fill in upper-tail empirical p-values for every gene pair whose row gene lies
# in the presence bin `presbin`, one gain bin at a time.
# BECAUSE THIS SCRIPT IS PARALLELIZED, THE OUTER LOOP OVER PRESENCE BINS IS
# COMMENTED OUT: JUST RUN ONE JOB FOR EACH PRESENCE/PREVALENCE BIN (PRESBIN).
#for (presbin in sort(presbins)) {
#	print('present bin')
#	print(presbin)
	geneinbin <- gene_bins[gene_bins[,'Pres_bin']==presbin,]
	
	# narrow output data to only genes in the relevant bin.
	# as.vector() forces lookup by gene NAME: a factor index would be
	# interpreted as integer codes instead of labels.
	pvals <- pvals[as.vector(geneinbin[,'Gene']),,drop=FALSE]
	# isolate bin-relevant observed values to test
	koko_pres <- kobyko[as.vector(geneinbin[,'Gene']),,drop=FALSE]
	
	# now go through gain bins within the presence bin
	for (gainbin in sort(gainbins)) {	#[1:6]) { # UNCOMMENT
		print('gain bin')
		print(gainbin)
		
		# genes belonging to this gain bin (these become the columns under test)
		geneinbin_gain <- gene_bins[gene_bins[,'Gain_bin']==gainbin,]

		# read in the null distribution for the pres-bin / gain-bin combo
		filename <- paste(binfilename,presbin,gainbin,'txt',sep='.')
		all <- as.matrix(read.table(file.path(working_dir,filename),header=TRUE))
		# drop=FALSE keeps a matrix even when either bin holds a single gene;
		# without it the row lookup below fails on a plain vector
		koko_test <- koko_pres[,as.vector(geneinbin_gain[,'Gene']),drop=FALSE]
		
		# go through gene x gene interactions in each bin
		for (presval in unique(geneinbin[,'prevalence'])) {
			# ASSIGN A NULL TO A VALUE, AND THEN VALUES TO GENES,
			# RATHER THAN FOR EACH GENE SEPARATELY - SHOULD GIVE A SPEEDUP
			presvalgenes <- as.vector(geneinbin[geneinbin[,'prevalence']==presval,'Gene'])
			
			# all genes in presvalgenes share the same prevalence value, so the
			# first one is used as the representative for the null lookup
			presgene <- presvalgenes[1]
			#print(presgene)
			null_pres_genes <- presnull[presnull[,1]==presgene,]

			# restrict the null matrix to this gene's null rows just once per
			# prevalence value: entries 2..n of the matching presnull row are
			# used as row indices into `all`.
			# NOTE(review): assumes exactly one presnull row matches - confirm.
			# drop=FALSE keeps two dimensions when only one null row is selected.
			tonull <- all[as.matrix(null_pres_genes)[2:length(null_pres_genes)],,drop=FALSE]
			
			# small set of observed rows to look up at a time
			koko_presval <- koko_test[as.vector(presvalgenes),,drop=FALSE]

			for (gainval in unique(geneinbin_gain[,'gain_num'])) {

				# same representative-gene trick on the gain side
				gainvalgenes <- as.vector(geneinbin_gain[geneinbin_gain[,'gain_num']==gainval,'Gene'])
				gaingene <- gainvalgenes[1]

				null_gain_genes <- gainnull[gainnull[,1]==gaingene,]
				vals <- koko_presval[,gainvalgenes,drop=FALSE]
				# entries 2..n of the matching gainnull row select the null columns
				null <- tonull[,as.matrix(null_gain_genes)[2:length(null_gain_genes)],drop=FALSE]

				# p-value = fraction of null entries >= the observed value
				pvals[presvalgenes,gainvalgenes] <- apply(vals[presvalgenes,gainvalgenes,drop=FALSE],c(1,2),calc_p,nullmat=null) / length(null)

			}
		}
		# written after every gain bin; the file name depends only on presbin,
		# so each write replaces the previous one
		save(pvals,file=file.path(working_dir,paste('sim_null_pvals.Rdat',presbin,sep='.')))
	}
#	}
