library(evemodel)
library(ape)
library(optparse)
library(ggplot2)
library(qvalue)
#library(mgcViz) may need to be installed for the qqplots

# Command-line interface. Every option has a default, so the script can be
# run without arguments from the directory layout those defaults assume.
option_list <- list(
  make_option(c("-i", "--infile"), type="character", default="../../dataset_s2/reads_density/Promoters_Heart_fpkm_normalized.csv",
              help="Input tab-delimiter file with replicates as columns and regions/genes as rows. [default= %default]"),
  make_option(c("-s", "--species_tree"), type="character", default="data/nmrdmr_sptree.nwk",
              help="Newick species tree [default= %default]"),
  make_option(c("-b", "--branch"), type="character", default="anc_mr",
              help="Foreground branch to test for shift, one of: 'anc_mr', 'hgla', 'fdam' or 'cpor' [default= %default]"),
  make_option(c("-o", "--outdir"), type="character", default="eve_twoTheta_test/",
              help="Output directory [default= %default]")
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# Create the output directory. recursive = TRUE lets nested paths such as
# "results/run1/" be created in one go; the explicit existence check turns a
# silent creation failure into an immediate, readable error instead of a
# later failure when the first output file is opened.
outDir <- opt$outdir
dir.create(outDir, showWarnings = FALSE, recursive = TRUE)
if (!dir.exists(outDir)) {
  stop("Could not create output directory '", outDir, "'.", call. = FALSE)
}

# Name of the foreground branch on which a shift is tested
branch <- opt$branch

### LOAD INPUT DATA ###

# Load read density in orthologous regulatory elements: a tab-delimited table
# with one row per region/gene and one column per replicate.
regDF <- read.delim(opt$infile)
regMat <- as.matrix(regDF)
rownames(regMat) <- rownames(regDF)

# A non-numeric column (e.g. an identifier column that was not picked up as
# row names) silently turns the whole matrix into character; fail early.
if (!is.numeric(regMat)) {
  stop("Input file '", opt$infile, "' must contain only numeric columns.",
       call. = FALSE)
}

# Species of each column: everything from the first underscore onwards
# (e.g. the replicate number) is stripped from the column name.
colSpecies <- sub("_.*$","",colnames(regMat))

# Load the species tree and save a plot of it in the output directory.
# file.path() keeps the file inside outDir whether or not --outdir was given
# with a trailing slash.
speciesTree <- read.tree(opt$species_tree)
pdf(file.path(outDir, "sptree.pdf"))
plot(speciesTree)
add.scale.bar(x=0, y=1, length = 0.01)
dev.off()


### RUN EVE TWOTHETA TEST ###

# Configure foreground branch.
# Each vector has one entry per edge of the species tree; a 1 marks an edge
# passed to twoThetaTest() as a theta2 (foreground) edge.
# !! hard-coded to detect shifts on the selected branch, works only for this specific species tree!!
theta2EdgesByBranch <- list(
  anc_mr = c(0, 1, 1, 1, 0, 0),
  hgla   = c(0, 0, 0, 1, 0, 0),
  fdam   = c(0, 0, 1, 0, 0, 0),
  cpor   = c(0, 0, 0, 0, 1, 0)
)

# Reject unknown branch names with a proper error (stderr, non-zero exit
# status under Rscript) rather than printing to stdout and quitting.
if (!isTRUE(branch %in% names(theta2EdgesByBranch))) {
  stop("--branch should be one of 'anc_mr', 'hgla', 'fdam', 'cpor'.",
       call. = FALSE)
}
isTheta2edge <- theta2EdgesByBranch[[branch]]

# The edge flags above are only meaningful for a tree with the same number of
# edges; guard against running with a different species tree.
if (length(isTheta2edge) != nrow(speciesTree$edge)) {
  stop("The hard-coded foreground edges assume a tree with ",
       length(isTheta2edge), " edges, but '", opt$species_tree, "' has ",
       nrow(speciesTree$edge), ".", call. = FALSE)
}

### RUN EVE TWOTHETA TEST TO IDENTIFY REGULATORY ACTIVITY SHIFTS ON THE SELECTED BRANCH ###

res <- twoThetaTest(tree = speciesTree, gene.data = regMat, isTheta2edge, colSpecies = colSpecies)

# Common prefix of the per-branch output files: <outdir>/<branch>_<input name>.
# file.path() keeps the files inside outDir even without a trailing slash.
outPrefix <- file.path(outDir, paste0(branch, "_", tools::file_path_sans_ext(basename(opt$infile))))

# QQplot shows a departure of LRT from the chi2 distribution.
# qqplot() is a base-graphics function, so the y = x reference line has to be
# added with abline(); a ggplot2 layer cannot be added to it with `+`.
pdf(paste0(outPrefix, "_LRT_qqplot_chi2_vs_data.pdf"))
qqplot(qchisq(ppoints(1000), df = 1), res$LRT)
abline(a = 0, b = 1, lty = "dotted", col = "red")
dev.off()


### RUN EVE TWOTHETA TEST ON NULL-SIMULATED DATA USING MEDIAN PARAMETERS ESTIMATED FROM THE DATA ###

# median parameters of the null (one-theta) model estimated from the data
params <- apply(res$oneThetaRes$par, 2, median)
print(params)

# run null simulations (fixed seed so that the null distribution is reproducible)
set.seed(123)
simData <- simOneTheta(n = 1000, tree = speciesTree, colSpecies = colSpecies,
                       theta = params["theta"], sigma2 = params["sigma2"],
                       alpha = params["alpha"], beta = params["beta"])

# run the evemodel to test for regulatory activity shift on the selected branch(es)
resSim <- twoThetaTest(tree = speciesTree, gene.data = simData, isTheta2edge, colSpecies = colSpecies)

# Common prefix of the per-branch output files: <outdir>/<branch>_<input name>.
# file.path() keeps the files inside outDir even without a trailing slash.
outPrefix <- file.path(outDir, paste0(branch, "_", tools::file_path_sans_ext(basename(opt$infile))))

# QQplot shows a departure of LRT from the chi2 distribution.
# qqplot() is base graphics, so the y = x reference line is drawn with
# abline(); a ggplot2 layer cannot be added to it with `+`.
pdf(paste0(outPrefix, "_LRT_qqplot_chi2_vs_sim.pdf"))
qqplot(qchisq(ppoints(1000), df = 1), resSim$LRT)
abline(a = 0, b = 1, lty = "dotted", col = "red")
dev.off()

# QQplot shows similar distribution for the LRT of the TwoThetaTest between simulated and observed data
pdf(paste0(outPrefix, "_LRT_qqplot_data_vs_sim.pdf"))
qqplot(resSim$LRT, res$LRT)
abline(a = 0, b = 1, lty = "dotted", col = "red")
dev.off()


# Common prefix of the per-branch output files: <outdir>/<branch>_<input name>.
# file.path() keeps the files inside outDir even without a trailing slash.
outPrefix <- file.path(outDir, paste0(branch, "_", tools::file_path_sans_ext(basename(opt$infile))))

# Compute empirical p-values from simulation and write the TwoThetaTest results to file.
# shift is the fitted theta2 minus theta1 for each element; the p-value of
# each observed LRT is computed against the LRTs of the null simulations.
element.id <- rownames(regMat)
shift <- res$twoThetaRes$par[, "theta2"] - res$twoThetaRes$par[, "theta1"]
empirical.p.value <- empPvals(res$LRT, resSim$LRT)
df.final <- data.frame(element.id, shift, empirical.p.value)
write.table(df.final, paste0(outPrefix, "_eve_results.csv"),
            row.names = FALSE, sep = '\t', quote = FALSE)

# Save null distribution of LRT together with the shifts fitted on the simulated data.
# The columns are named explicitly so the file keeps the "shift" and "LRT"
# headers without overwriting the observed `shift` vector above.
df.nullsim <- data.frame(
  shift = resSim$twoThetaRes$par[, "theta2"] - resSim$twoThetaRes$par[, "theta1"],
  LRT = resSim$LRT
)
write.table(df.nullsim, paste0(outPrefix, "_null_LRT.csv"),
            row.names = FALSE, sep = '\t', quote = FALSE)