library(evemodel)
library(ape)
library(optparse)
library(ggplot2)
library(qvalue)
#library(mgcViz)

# Command-line interface. Every option has a default, so the script can be run
# without any argument.
option_list <- list(
  make_option(c("-i", "--infile"), type = "character", default = "../../dataset_s2/reads_density/Promoters_Heart_fpkm_normalized.csv",
              help = "Input tab-delimiter file with replicates as columns and regions/genes as rows. [default= %default]"),
  make_option(c("-s", "--species_tree"), type = "character", default = "data/nmrdmr_sptree.nwk",
              help = "Newick species tree [default= %default]"),
  make_option(c("-b", "--branch"), type = "character", default = "anc_mr",
              help = "Foreground branch to test for shift, one of: 'anc_mr', 'hgla' or 'fdam' [default= %default]"),
  make_option(c("-o", "--outdir"), type = "character", default = "eve_twoTheta_test/",
              help = "Output directory [default= %default]")
)


opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# Output directory. It is used below as a plain string prefix for file names
# (pasted without a separator), so it is expected to end with a path separator.
# recursive = TRUE also creates missing parent directories; without it a nested
# --outdir was silently not created (warnings are suppressed) and the script
# only failed later when reading from / writing to that directory.
outDir <- opt$outdir
dir.create(outDir, showWarnings = FALSE, recursive = TRUE)

# Name of the foreground branch on which a shift is tested.
branch <- opt$branch


### LOAD INPUT DATA ###

# Read-density table for the orthologous regulatory elements, kept both as a
# data frame and as a matrix that carries the same row names.
regDF <- read.delim(file = opt$infile)
regMat <- as.matrix(regDF)
rownames(regMat) <- rownames(regDF)

# Species of each column: the column name with everything from the first
# underscore onwards stripped off (i.e. without the replicate suffix).
replicateNames <- colnames(regMat)
colSpecies <- sub("_.*$", "", replicateNames)

# Species tree, read from the Newick file given on the command line.
speciesTree <- read.tree(file = opt$species_tree)


# Configure foreground branch
# !! Hard-coded to detect shifts on the selected branch; works only for this
# !! specific species tree. Each vector has one 0/1 flag per tree edge
# !! (presumably in the edge order of `speciesTree`, 1 = edge evolving under
# !! theta2 -- confirm against the evemodel documentation).
#
# 'fdam' is the name advertised by the --branch help text and by the error
# message below, while the script historically only recognised 'dmr'; both
# spellings are accepted so that the documented value works and existing
# invocations keep working.
isTheta2edge <- switch(
  branch,
  anc_mr = c(0, 1, 1, 1, 0, 0),
  hgla = c(0, 0, 0, 1, 0, 0),
  fdam = ,
  dmr = c(0, 0, 1, 0, 0, 0),
  NULL
)

if (is.null(isTheta2edge)) {
  # Unknown branch name: report on stderr and abort with a non-zero status.
  message("Error, --branch should be one of 'anc_mr', 'hgla', 'fdam'. Exiting.")
  quit(status = 1)
}

# Load the LRT values from the null simulations; they are used below to compute
# the p-values of the simulated elements. The file is looked up in the output
# directory under the name <branch>_<input file name without extension>_null_LRT.csv.
datasetName <- tools::file_path_sans_ext(basename(opt$infile))
simlrt <- paste0(outDir, branch, '_', datasetName, "_null_LRT.csv")
sim.for.pval <- read.table(simlrt, header = TRUE)


### HARD-CODED parameters for simulations (median params from the data) ###

# One entry per supported dataset, keyed on the input file name. The values are
# passed to the simulation calls further down as theta (t), sigma2 (s),
# alpha (a) and beta (b).
simParams <- list(
  "Enhancers_Liver_fpkm_normalized.csv" =
    c(t = 1.2397373, s = 159.1669648, a = 122.3232240, b = 0.9482376),
  "Enhancers_Heart_fpkm_normalized.csv" =
    c(t = 1.610017, s = 96.709034, a = 109.056712, b = 1.523757),
  "Promoters_Heart_fpkm_normalized.csv" =
    c(t = 2.694493, s = 62.163889, a = 106.741894, b = 2.205785),
  "Promoters_Liver_fpkm_normalized.csv" =
    c(t = 2.851738, s = 106.617073, a = 117.530997, b = 1.240002)
)

# Match on the file name rather than on the exact relative path, so the same
# dataset is recognised wherever the script is launched from.
infileName <- basename(opt$infile)

# Fail early for an unknown dataset. Previously nothing was assigned in that
# case: `t` silently resolved to base::t() and s/a/b were undefined, which only
# surfaced as an obscure error inside the simulation loop.
if (!infileName %in% names(simParams)) {
  stop(
    "No hard-coded simulation parameters for input file '", infileName,
    "'. Supported files: ", paste(names(simParams), collapse = ", "),
    call. = FALSE
  )
}

datasetParams <- simParams[[infileName]]

# NOTE: `t` masks base::t() for the remainder of the script; the name is kept
# because the simulation code below refers to it.
t <- datasetParams[["t"]]
s <- datasetParams[["s"]]
a <- datasetParams[["a"]]
b <- datasetParams[["b"]]


### RUN SIMULATIONS WITH VARYING PROPORTIONS OF NULLS AND POSITIVES ###

# Fixed seed so that repeated runs draw the same random numbers.
set.seed(456)

# Total number of simulated elements per run (nulls + shifted).
nElements <- 1000

# For each proportion p of shifted elements:
#   1. simulate the null elements (single theta) and the shifted elements
#      (theta2 = theta + a random shift on the foreground edges),
#   2. run the two-theta test on both sets,
#   3. turn the LRT statistics into empirical p-values using the LRT values
#      loaded from the null simulations,
#   4. write labels, p-values and parameter estimates to a tab-separated file.
for (p in c(0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20)) {
  print(paste0("Simulation with ", p * 100, "% shifted elements"))

  # Round the counts so they are exact whole numbers regardless of
  # floating-point error in 1000 * p (a value such as 174.999... would
  # otherwise be truncated and the two sets would no longer add up to 1000).
  nShifted <- round(nElements * p)
  nNull <- nElements - nShifted

  # Shift sizes for the positives: Beta(8, 2) draws scaled to the range (0, 3).
  shift.sim <- rbeta(nShifted, 8, 2) * 3

  # Null elements: one optimum (theta) on the whole tree.
  simDataF <- simOneTheta(n = nNull, tree = speciesTree, colSpecies = colSpecies,
                          theta = t, sigma2 = s, alpha = a, beta = b)
  resSimF <- twoThetaTest(tree = speciesTree, gene.data = simDataF, isTheta2edge,
                          colSpecies = colSpecies)

  # Shifted elements: simulated one at a time because each has its own theta2.
  # seq_along() keeps this safe should shift.sim ever be empty.
  shiftedRows <- lapply(seq_along(shift.sim), function(i) {
    simTwoTheta(n = 1, tree = speciesTree, colSpecies = colSpecies,
                theta1 = t, sigma2 = s, alpha = a, beta = b,
                isTheta2edge = isTheta2edge, theta2 = t + shift.sim[i])
  })
  simDataT <- do.call(rbind, shiftedRows)
  resSimT <- twoThetaTest(tree = speciesTree, gene.data = simDataT, isTheta2edge,
                          colSpecies = colSpecies)

  # Empirical p-values against the null LRT distribution (nulls first, then
  # shifted elements -- the same order as `labels`).
  pval <- c(empPvals(resSimF$LRT, sim.for.pval$LRT),
            empPvals(resSimT$LRT, sim.for.pval$LRT))

  # Estimated shift (theta2 - theta1) under the two-theta model.
  parF2 <- resSimF$twoThetaRes$par
  parT2 <- resSimT$twoThetaRes$par
  estimated.shift <- c(parF2[, "theta2"] - parF2[, "theta1"],
                       parT2[, "theta2"] - parT2[, "theta1"])

  # Ground truth: 0 = simulated without shift, 1 = simulated with shift.
  labels <- rep(c(0, 1), times = c(nNull, nShifted))

  # Estimates of one parameter stacked as: one-theta fit (nulls, shifted)
  # followed by two-theta fit (nulls, shifted), i.e. twice as many values as
  # there are elements.
  stackEstimates <- function(parName) {
    c(resSimF$oneThetaRes$par[, parName], resSimT$oneThetaRes$par[, parName],
      parF2[, parName], parT2[, parName])
  }

  # The parameter columns are twice as long as the per-element columns, so the
  # latter are repeated once: the first block of rows holds the one-theta
  # estimates and the second block the two-theta estimates. This makes explicit
  # the recycling that data.frame() previously performed silently, leaving the
  # output table unchanged.
  df <- data.frame(
    labels = rep(labels, times = 2),
    pval = rep(pval, times = 2),
    estimated.shift = rep(estimated.shift, times = 2),
    estimated.sigma2 = stackEstimates("sigma2"),
    estimated.alpha = stackEstimates("alpha"),
    estimated.beta = stackEstimates("beta")
  )

  outFile <- paste0(outDir, tools::file_path_sans_ext(basename(opt$infile)),
                    "_shift_beta_", p, "shifted_eve_Sim_all_params_", branch, ".csv")
  write.table(df, outFile, row.names = FALSE, sep = "\t", quote = FALSE)
}