#!/usr/bin/Rscript
# to compute power for some null distribution, based on expected distribution of genes along both parameters.
# cannibalized from a hypotest_GL_binwise_byval_033014.R function on 05/28/14.
# also writes the number of gene pairs behind each value combination to a file.



# Prefix of the per-bin null tables; the main loop builds file names of the
# form <binfilename>.<presbin>.<gainbin>.txt from it.
binfilename <- 'full_bin'
# Bin size setting; not referenced anywhere else in the visible script.
realbinsize <- 1000

# source files
print('reading 1')


# Remove leading and trailing whitespace from every element of x.
# The script applies this to a whole character matrix and keeps indexing the
# result by row and column afterwards.
trim <- function(x) {
  edge_whitespace <- "^\\s+|\\s+$"
  gsub(pattern = edge_whitespace, replacement = "", x = x)
}

# ---- Load inputs ----
# presnull / gainnull: the main loop matches column 1 of these tables against a
# gene name and uses the remaining entries of that row to index into the
# per-bin null tables (rows for presnull, columns for gainnull).
print('reading 2')
presnull <- as.matrix(read.table('processed_data/reproduced_analysis/nullgenes_for_realgenes_presence_1M.txt'))
print('reading 3')
gainnull <- as.matrix(read.table('processed_data/reproduced_analysis/nullgenes_for_realgenes_gain_1M.txt'))
print('reading 4')
# Character matrix describing the real genes; trim() strips the whitespace
# around every cell.
gene_bins <- trim(as.matrix(read.table('processed_data/reproduced_analysis/real_gene_bins_1M.txt', header = TRUE)))
print('done reading')

# Fail early, with a readable message, if a column that the rest of the script
# indexes by name is missing.
required_cols <- c('Gene', 'prevalence', 'gain_num', 'Pres_bin', 'Gain_bin')
missing_cols <- setdiff(required_cols, colnames(gene_bins))
if (length(missing_cols) > 0) {
  stop('real_gene_bins_1M.txt is missing column(s): ',
       paste(missing_cols, collapse = ', '), call. = FALSE)
}

# Truncate columns 2 and 3 to whole numbers (stored back as character, because
# gene_bins is a character matrix).
# NOTE(review): presumably columns 2 and 3 are 'prevalence' and 'gain_num' -
# confirm against the header of the input file.
gene_bins[, 2] <- floor(as.numeric(gene_bins[, 2]))
gene_bins[, 3] <- floor(as.numeric(gene_bins[, 3]))

print('reformatting')
print('extracting')
# Distinct bin labels with all spaces removed; they become part of the null
# table file names in the main loop.
presbins <- gsub(' ', '', unique(gene_bins[, 'Pres_bin']))
gainbins <- gsub(' ', '', unique(gene_bins[, 'Gain_bin']))

# initialize pval matrix - UPPER TAIL ONLY for now
print('initializing')
# One row per distinct (floored) prevalence, one column per distinct (floored)
# gain_num; every cell starts at 1.
prev_values <- unique(floor(as.numeric(gene_bins[, 'prevalence'])))
gain_values <- unique(floor(as.numeric(gene_bins[, 'gain_num'])))
powerps <- matrix(
  1,
  nrow = length(prev_values),
  ncol = length(gain_values),
  dimnames = list(as.character(prev_values), as.character(gain_values))
)

# paircounts shares the shape and labels of powerps (and therefore also starts
# at 1 in every cell); the main loop overwrites the cells it visits.
paircounts <- powerps

# Count how many entries of nullmat are greater than or equal to val
# (upper-tail count). Comparisons that evaluate to NA are not counted.
# Returns the raw count; the caller divides it by the number of entries.
calc_p <- function(val, nullmat) {
  at_or_above <- nullmat >= val
  sum(at_or_above, na.rm = TRUE)
}

# now go through gene bins
#
# For every (presence bin, gain bin) pair: read that pair's null table, then for
# each distinct prevalence value in the presence bin and each distinct gain_num
# value in the gain bin record
#   powerps[prevalence, gain_num]    - fraction of the selected null entries
#                                      that are >= min(prevalence, gain_num)
#   paircounts[prevalence, gain_num] - (# genes with that prevalence) *
#                                      (# genes with that gain_num)
# Genes sharing a value are treated as sharing one null set, so only the first
# gene of each group is looked up in presnull / gainnull.
for (presbin in sort(presbins)) {
  print('present bin')
  print(presbin)

  # drop = FALSE: a bin holding a single gene must stay a 1-row matrix,
  # otherwise the column lookups below fail.
  geneinbin <- gene_bins[gene_bins[, 'Pres_bin'] == presbin, , drop = FALSE]
  # Compare values numerically (not as strings) and floor them the same way as
  # the row labels of powerps.
  bin_presvals <- floor(as.numeric(geneinbin[, 'prevalence']))

  # now go through gain bins within the presence bin
  for (gainbin in sort(gainbins)) {
    print('gain bin')
    print(gainbin)

    # genes belonging to the current gain bin
    geneinbin_gain <- gene_bins[gene_bins[, 'Gain_bin'] == gainbin, , drop = FALSE]
    bin_gainvals <- floor(as.numeric(geneinbin_gain[, 'gain_num']))

    # read in null distribution for the pres-bin / gain-bin combo
    filename <- file.path('processed_data/reproduced_analysis',
                          paste(binfilename, presbin, gainbin, 'txt', sep = '.'))
    null_table <- as.matrix(read.table(filename, header = TRUE))

    # Assign a null to each VALUE and then values to genes, rather than
    # handling every gene separately.
    for (presval in unique(bin_presvals)) {
      # which() drops NA comparisons, so an unparsable prevalence is skipped.
      presvalgenes <- as.vector(geneinbin[which(bin_presvals == presval), 'Gene'])
      if (length(presvalgenes) == 0) {
        next
      }

      # all genes in presvalgenes have equal prevalence, so the first one
      # stands in for the group
      presgene <- presvalgenes[1]
      pres_rows <- which(presnull[, 1] == presgene)
      if (length(pres_rows) == 0) {
        stop('no presence null set found for gene ', presgene, call. = FALSE)
      }
      # Use the first matching row only; everything after column 1 is the
      # null set for this gene.
      null_pres_genes <- unname(presnull[pres_rows[1], -1])

      # restrict the null table to this null set just once per prevalence
      tonull <- null_table[null_pres_genes, , drop = FALSE]

      for (gainval in unique(bin_gainvals)) {
        gainvalgenes <- as.vector(geneinbin_gain[which(bin_gainvals == gainval), 'Gene'])
        if (length(gainvalgenes) == 0) {
          next
        }

        gaingene <- gainvalgenes[1]
        gain_rows <- which(gainnull[, 1] == gaingene)
        if (length(gain_rows) == 0) {
          stop('no gain null set found for gene ', gaingene, call. = FALSE)
        }
        null_gain_genes <- unname(gainnull[gain_rows[1], -1])

        # the value tested against the null is the smaller of the two values
        threshold <- min(c(presval, gainval))

        # null entries for (presence null set) x (gain null set)
        null <- tonull[, null_gain_genes, drop = FALSE]

        powerps[paste(presval), paste(gainval)] <- calc_p(threshold, null) / length(null)
        paircounts[paste(presval), paste(gainval)] <- length(presvalgenes) * length(gainvalgenes)
      }
    }
  }
}

# Write both result matrices as plain-text tables, without quoting.
powerps_path <- 'processed_data/sim_null_powerps.txt'
write.table(powerps, file = powerps_path, quote = FALSE)

paircounts_path <- 'processed_data/sim_null_paircounts.txt'
write.table(paircounts, file = paircounts_path, quote = FALSE)
