#!/usr/bin/env bash
# Should reproduce the entire HGT-PGCE analysis in Press et al.
# when run from the main dir with sufficient memory (lots - maybe 8G).
# Do not simply launch this script and wait: it will take many days to finish.
# The hypothesis tests in particular take forever because they open and close
# a lot of files, so consider parallelizing them across the $i chunks.
echo 'This script will take forever to run if you just run it! I would recommend parallelizing some steps (see readme and comments herein). -MOP'

# First, a dir to hold output.
# -p makes this safe to rerun (no error if the dir already exists) and creates
# any missing parent; stop right away if the dir cannot be created.
mkdir -p processed_data/reproduced_analysis || exit 1

# Process raw gainLoss output into a usable format.
# Later steps consume this step's output, so stop here if it fails.
Rscript code/driver_scripts/run_HGT_preprocess.R MOtree_GLrun || {
	echo 'run_HGT_preprocess.R failed; aborting.' >&2
	exit 1
}

# Take the previously simulated data (exactly matching the analysis) and make
# it into null distributions.
# The hypothesis tests need these null distributions, so stop here on failure.
Rscript code/driver_scripts/run_HGT_simulate_REPRODUCE.R || {
	echo 'run_HGT_simulate_REPRODUCE.R failed; aborting.' >&2
	exit 1
}

# Use the C_ij matrix from the first step and the null distributions from the
# second step to test for PGCEs.
# The dataset is split into 14 chunks for convenience - otherwise it is too huge.
# Note that run this way, this step will probably take at least a couple of weeks.
# The chunk numbers are spelled out (instead of brace expansion) so the loop
# also works when the script is run with a plain POSIX sh.
failed_chunks=''
for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14; do
	if ! Rscript code/driver_scripts/run_hypotest.R "$i" processed_data/reproduced_analysis gainLoss_results/MOtree_GLrun; then
		# Keep going so the remaining chunks still get computed,
		# but remember which ones have to be rerun.
		echo "run_hypotest.R failed for chunk $i" >&2
		failed_chunks="$failed_chunks $i"
	fi
done

# Do not continue with an incomplete set of hypothesis-test results.
if [ -n "$failed_chunks" ]; then
	echo "Hypothesis tests failed for chunk(s):$failed_chunks; aborting." >&2
	exit 1
fi

# put pvals into single file
# NOTE(review): presumably run_HGT_datascript.R is what collects the p-values - confirm.
# knitr's spin() is given the R script to process; the closing quote and
# parenthesis are required for R to parse the expression at all.
Rscript -e "library(knitr); spin('code/driver_scripts/run_HGT_datascript.R')"




