#!/usr/bin/Rscript

####
# 8/2014: code to simulate gain/loss evolution of genes based on gain/loss rates derived from gainLoss.

###################
# 10/22/14 issues: need to fix parsing of simulated gene filenames
# for proper interpretation in MMM loop- currently ends files with 
# ".NULL.NULL" which is bad.
# also haven't tested that the working_dir is functioning properly
# HAVE NOT harmonized the specified data_dir as a place to look for
# presence and gain files for real gene reconstruction, currently
# hard-coded to my example.
# CHANGE THIS
# gains_prev file is also currently hard-coded weirdly. REDO IT!!
#
# 6/2015: all issues resolved, leaving above as example+historical document
###################

require(ape)
require(MASS)
#args <- commandArgs(trailingOnly = TRUE)
#realpresfile = args[1]
#realgainfile = args[2]
#working_dir = args[3]
#data_dir = args[4]

# data_dir is the source of real data. It is read from the command line:
#   Rscript <this script> <data_dir>
# (note that the working directory for simulated genes, defined below, is
# created inside data_dir.)
# data_dir = 'MOtree_GLrun'
data_dir <- commandArgs(trailingOnly = TRUE)

# Fail early with a readable message: a missing or extra argument would
# otherwise only surface later as an obscure error from dir()/load()/read.table().
if (length(data_dir) != 1) {
	stop('expected exactly one command-line argument (the data directory), got ',
		length(data_dir), call. = FALSE)
}

# THE FOLLOWING SHOULD NOW BE DONE WITH COMMAND-LINE ARGS EXCEPT FOR TESTING!!
### CHANGE these depending on what reconstruction you are using
# The real presence/gain probability tables are located by filename pattern
# inside data_dir.
realpresfile <- file.path(data_dir, dir(data_dir, pattern = 'pres_probs'))
realgainfile <- file.path(data_dir, dir(data_dir, pattern = 'gain_probs'))

# Each pattern must match exactly one file; these paths are handed to
# read.table() at the end of the script, which needs a single path. Checking
# here avoids discovering the problem only after the simulation has finished.
if (length(realpresfile) != 1 || length(realgainfile) != 1) {
	stop(sprintf(
		"expected exactly one 'pres_probs' and one 'gain_probs' file in '%s', found %d and %d",
		data_dir, length(realpresfile), length(realgainfile)), call. = FALSE)
}

print(realpresfile)
print(realgainfile)

# give these a more meaningful naming when i have got one
#realgainfile = paste(data_dir,'021214_MOtree_gains.txt.gz')
#realpresfile = paste(data_dir,'021214_MOtree_pres.txt.gz') 

# doing this now for simplicity!!! hopefully no confusion will result
# working_dir is where files will be saved and processed
# file.path() has no `sep` argument (its separator argument is `fsep`), so the
# former `sep=''` was silently treated as an extra, empty path component. The
# empty component is kept on purpose so working_dir has exactly the same value
# as before (trailing separator included).
# NOTE(review): the scripts sourced later may build paths from working_dir by
# plain concatenation and rely on that trailing separator -- confirm before
# dropping it.
working_dir <- file.path(data_dir, 'null_simed_genes_new', '')
cat('working dir is ',working_dir,'data dir is ',data_dir,'\n')
# say how many genes you want to simulate
# should be multiple of 10,000
simgene_num <- 100000

# from run_HGT_preprocess.R 
###
# make sure that there is only the file that you are interested in with this ending!!!
# NEED TO CHANGE THIS!!!!!!!
#gainsprev_file = ls(pattern='_gain_prev.Rdat')

# read in gain/loss rates
# NOTE: THIS IS DONE DOWNSTREAM
#gainrates = read.table('MOtree_GLrun/gain4site.txt',header=T)
#lossrates = read.table('MOtree_GLrun/loss4site.txt',header=T)

# now run through simulator as often as needed
# Create the working directory from R rather than via a shell `mkdir`: this
# works for paths containing spaces, does not depend on a POSIX shell, and
# lets a real failure stop the run instead of being ignored.
if (!file.exists(working_dir) && !dir.create(working_dir)) {
	stop('could not create working directory: ', working_dir, call. = FALSE)
}

# One pass of the simulator per block of 10,000 genes. seq_len() yields an
# empty sequence when no genes are requested (1:0 would iterate over 1 and 0).
for (i in seq_len(ceiling(simgene_num / 10000))) {
	cat('simulating replicate',i,'X 10,000 genes\n')

	# modify script to save outputs from overwriting:
	# every literal occurrence of REPLICATE in the template is replaced by the
	# replicate number and the result is written to a temporary script. Doing
	# this in R (instead of shelling out to sed) means a missing or unreadable
	# template raises an error here rather than leaving a stale/missing
	# temporary script to be sourced below.
	writeLines(
		gsub('REPLICATE', as.character(i),
			readLines('code/simulate_MOtree_genes.R'), fixed = TRUE),
		'code/simulate_genes_tmp.R')

	# actually run simulator
	source('code/simulate_genes_tmp.R')
}

# Build labels for simulated genes.
#   pos:        position(s) of the gene(s) within the file (vectorised)
#   filestring: name of the file the genes came from
# Every '_presence' and then every '_gain' substring is stripped from
# filestring, and the result is joined to pos with a '.' separator.
makelabels <- function(pos, filestring) {
	stem <- filestring
	for (tag in c('_presence', '_gain')) {
		stem <- gsub(tag, '', stem)
	}
	paste(stem, pos, sep = '.')
}

# perform long list of matrix multiplications, save to files.
# note that multiplied files are labeled XXXX.i.j, where i is the source file for the row genes,
# whereas j is the source file for the col genes.

# as part of this step, compute overall counts of presence/gain of sim'ed genes.

print('matrix-matrix multiplications (for C_null)')
presmats <- dir(path = working_dir, pattern = 'apesim_presence')
gainmats <- dir(path = working_dir, pattern = 'apesim_gain')

# initialize counters (named vectors, filled in by gene label)
countedpres <- c()
countedgain <- c()

# actual loop- resource-intensive: every presence file is multiplied against
# every gain file, and the gain file is re-loaded for each pair.
for (mati in presmats) {
	# load the presence data; the code below reads the object `present`,
	# which load() is expected to define from this file
	load(file.path(working_dir, mati))
	print(mati)
	# strsplit returns a list of length 1 where entry is vector of split strings;
	# the third '.'-separated field of the filename is used as the file id
	presnum <- strsplit(mati, '\\.')[[1]][3]
	# label the genes; makelabels() is vectorised over positions, so one call
	# replaces a per-row sapply()
	rownames(present) <- makelabels(seq_len(nrow(present)), mati)
	countedpres[rownames(present)] <- rowSums(present)

	for (matj in gainmats) {
		# load gain data; the code below reads the object `gain`,
		# which load() is expected to define from this file
		load(file.path(working_dir, matj))
		# strsplit returns a list of length 1 where entry is vector of split strings
		gainnum <- strsplit(matj, '\\.')[[1]][3]
		print(matj)
		# label the genes
		rownames(gain) <- makelabels(seq_len(nrow(gain)), matj)
		countedgain[rownames(gain)] <- rowSums(gain)

		# do and save the multiplication
		# note that mats are flipped from real data (t() other mat)
		mult <- present %*% t(gain)
		stopifnot(!anyNA(mult))
		# output file: <working_dir>/simgenes_presgain_mult.<presnum>.<gainnum>
		save(mult, file = paste(file.path(working_dir, 'simgenes_presgain_mult'),
			presnum, gainnum, sep = '.'))
		# just try to get this out of memory
		mult <- c()
	}
}


# Gather the per-gene totals into a single two-column matrix `counts`
# (one row per simulated gene, ordered and named by the presence labels) so
# the assignment code can work from one object. The gain totals are looked up
# by the presence labels.
counts <- cbind(countedpres[names(countedpres)], countedgain[names(countedpres)])
dimnames(counts) <- list(names(countedpres), c('countedpres','countedgain'))
# the separate counters are no longer needed once `counts` exists
rm(countedpres, countedgain)

# read in ancestral reconstructions for REAL data
# NOTE: from here on `gain` holds the real gain table; it replaces whatever
# `gain` held before this point.
# header = TRUE is spelled out: `T` is an ordinary variable that can be
# reassigned, and this script load()s and source()s external objects into the
# same environment.
print('reading in actual data')
pres <- read.table(realpresfile, header = TRUE)
gain <- read.table(realgainfile, header = TRUE)

# assign to bin files 
print('assigning simulated genes to individual null distributions for real genes (running external script)')
source('code/assign_genes_to_simbins.R')

# sort sim'ed genes into bin files according to their parameters (gain/prevalence)
print('sorting simulated genes into null distribution files (running external script)')
source('code/simsort_and_bin.R')

# after these steps are done, you can run the hypothesis tests. 
