#!/usr/bin/Rscript
####
# code by max (8/2014) to run a series of steps in preprocessing gainLoss output to get a suitable
# output for downstream analysis (inference of conditional gains/coevolution).
# this is the first thing to run in my pipeline for inferring gene-gene evolutionary dependencies.
# copyright (C) 2014 Maximilian Press, Elhanan Borenstein

#    This program is free software; you can redistribute it and/or
#    modify it under the terms of the GNU General Public License as
#    published by the Free Software Foundation; either version 2 of the
#    License, or (at your option) any later version.

#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#    General Public License for more details.

#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#    02111-1307, USA.

# On Debian GNU/Linux systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL'.
#### 

# The run to process lives in <upper_dir>/<data_dir>/; data_dir is the first
# command-line argument, e.g. 'MOtree_GLrun'.
upper_dir = 'gainLoss_results'
data_dir = commandArgs(trailingOnly = TRUE)[1]
# With no argument data_dir is NA and every path below would silently point at
# 'gainLoss_results/NA', so fail early with a usage hint instead.
if (is.na(data_dir) || !nzchar(data_dir)) {
	stop('usage: Rscript <this script> <data_dir>  (a directory under ', upper_dir, ')', call. = FALSE)
}
# Name of the gainLoss probabilistic ancestral reconstruction output.
# To read a gzipped copy instead, give the '.gz' file name here.
reconfile = 'AncestralReconstructPosterior.txt'

reconstruction = file.path(upper_dir,data_dir,reconfile)
print('reading')
# handle gzipped files: unpack to a scratch file in the working directory and
# point 'reconstruction' at it. Only a trailing '.gz' counts, so a directory
# name that merely contains 'gz' does not trigger decompression.
if (grepl('\\.gz$',reconstruction)) {
	# shQuote keeps paths containing spaces or shell metacharacters intact
	gunzip_status = system(paste('gzip -dc',shQuote(reconstruction),'> recon.tmp'))
	if (gunzip_status != 0) {
		stop('could not decompress ', reconstruction, call. = FALSE)
	}
	reconstruction = 'recon.tmp'
}

# retrieve metadata file
# 'pattern' is a regular expression, so the dot must be escaped to match a
# literal '.meta' rather than any character followed by 'meta'.
# dir() returns bare file names here (no directory prefix); 'meta' is empty
# when nothing matches.
meta = dir(file.path(upper_dir,data_dir),pattern='\\.meta')
if (length(meta)==0) {
	print('no meta file')
}
# Write the tree's ancestor/descendant pairs as a two-column text file
# (one edge per line: parent label, child label).
library(ape)
tree <- read.tree(file.path(upper_dir, data_dir, 'TheTree.INodes.ph'))
# tip labels followed by internal-node labels, so that the node numbers used in
# tree$edge can be used directly as indices into this vector
labels <- c(tree$tip.label, tree$node.label)
parentage <- cbind(
	labels[tree$edge[, 1]],
	labels[tree$edge[, 2]]
)
# strip the square brackets from the '[N1]' label
parentage <- gsub('\\[N1\\]', 'N1', parentage)
write.table(
	parentage,
	file = file.path(upper_dir, data_dir, 'TheTree.INodes.ph.parentage'),
	quote = FALSE,
	row.names = FALSE,
	col.names = FALSE
)

# run the reconstruction parsing
print('parsing reconstruction')
# Build the command once so that what is printed is exactly what was run.
# Arguments are shell-quoted so paths with spaces survive; an absent meta file
# contributes an empty trailing argument, as before.
meta_arg = if (length(meta) > 0) shQuote(meta) else ''
parse_cmd = paste('python code/parse_reconstruction_prob_clean.py',shQuote(reconstruction),shQuote(file.path(upper_dir,data_dir)),meta_arg,sep=' ')
parse_status = system(parse_cmd)
cat(parse_cmd,'[run]\n',sep=' ')
# recon.tmp only exists when a gzipped reconstruction was unpacked; unlink() is
# silent when the file is absent, so no spurious 'rm' error is shown.
unlink('recon.tmp')
# Stop here on failure rather than reading missing or stale parser output below.
if (any(parse_status != 0)) {
	stop('parse_reconstruction_prob_clean.py exited with status ', parse_status, call. = FALSE)
}

# the tree objects are not used again; drop them and trigger a garbage
# collection to free some memory
rm(list = c('labels', 'tree', 'parentage'))
gc()

print('new reading')
# read in the newly inferred matrices; they sit next to the reconstruction file
# with '.gain_probs' / '.pres_probs' appended to its name
# (presumably written by the parsing step -- confirm against the parser)
gain_path <- paste(reconstruction, 'gain_probs', sep = '.')
gains <- as.matrix(read.table(gain_path))
print('and...')
pres_path <- paste(reconstruction, 'pres_probs', sep = '.')
pres <- as.matrix(read.table(pres_path))


print('now doing MMM')
# obtain a C_ij matrix via MMM (see methods):
# C_ij[i, j] = sum over rows of pres[, i] * gains[, j]
print(dim(pres))
print(dim(gains))
# both matrices must have the same shape for the product and the labelling
# below to make sense; fail with a clear message if they do not
if (!identical(dim(pres), dim(gains))) {
	stop('pres and gains matrices have different dimensions', call. = FALSE)
}
# crossprod(x, y) computes t(x) %*% y without materialising the transposed copy
# of pres, which matters for large matrices
C_ij = crossprod(pres, gains)
# label both axes with the column names of pres
dimnames(C_ij) = list(colnames(pres), colnames(pres))

print('saving data')
# write it because you will want it later
save(C_ij,file=file.path(upper_dir,data_dir,'Cijmat.Rdat'))

# per-gene totals over the whole tree: column sums of the presence and gain
# matrices, bound side by side as columns 'prevsum' and 'gainsum'
prevsum <- colSums(pres)
gainsum <- colSums(gains)
params <- cbind(prevsum, gainsum)

save(params, file = file.path(upper_dir, data_dir, 'kos_gain_prev.Rdat'))
