#' In this script I want to apply the models across every tcga sample. I will look at the association of CNVs and drug sensitivity. In particular I am interested in whether the ERBB2 and lapatinib association is recovered here.

#' Load required libraries and set the root directory.
library("pRRophetic")
library("parallel")
library("gdata")
theRootDir <- "/mnt/data_scratch/finalData/"

#' Convert two-tailed p-values (e.g. those from a linear regression) into one-tailed p-values.
#'
#' A two-tailed p-value is halved when the sign of the matching effect size agrees with the
#' direction being tested; otherwise the one-tailed p-value is 1 - p/2. An effect size of
#' exactly zero is treated as not being in the tested direction.
#'
#' @param pvals Numeric vector of two-tailed p-values. Its names are copied to the result.
#' @param betaVals Numeric vector of effect sizes (e.g. regression coefficients), one per element of pvals.
#' @param alternative Direction of the one-tailed test: "greater" (effect > 0) or "less" (effect < 0).
#' @return Numeric vector of one-tailed p-values, the same length as pvals. An element is NA
#'   when its p-value or effect size is NA. Empty input gives an empty result.
makeOneTail <- function(pvals, betaVals, alternative="greater") 
{
  # Fail loudly on an unsupported direction rather than silently returning an empty vector.
  alternative <- match.arg(alternative, c("greater", "less"))
  stopifnot(length(pvals) == length(betaVals))

  inTestedDirection <- if(alternative == "greater") betaVals > 0 else betaVals < 0
  halfP <- pvals / 2

  # as.numeric() drops any names inherited from betaVals and keeps the type numeric for empty input.
  outVec <- as.numeric(ifelse(inTestedDirection, halfP, 1 - halfP))
  names(outVec) <- names(pvals)
  return(outVec)
}

#' Load the expression data and turn the "." characters in its sample names back into "-".
load(file = paste0(theRootDir, "/dataIn/tenRuvNewStandardApproach.RData")) # cancerTypesVec, tenRuvNewStandardApproach
colnames(tenRuvNewStandardApproach) <- chartr(".", "-", colnames(tenRuvNewStandardApproach))

#' Load the cnv data for all of TCGA. Each per-cancer-type file provides theCnvQuantVecList_mat
#' and tumorSamps; the matrices are joined column-wise into bigCnvMat.
diseaseAbbrvs <- c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "DLBC", "GBM", "HNSC", "KICH", "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")
theCnvQuantVecList_mat_list <- vector("list", length(diseaseAbbrvs))
tumorSamps_list <- vector("list", length(diseaseAbbrvs))
gnamesList <- vector("list", length(diseaseAbbrvs))
for(i in seq_along(diseaseAbbrvs))
{
  load(file=paste(theRootDir, "/dataIn/tcga_cnv_subtracted/cnvsMappedToGenes/", diseaseAbbrvs[i], ".RData", sep="")) # theCnvQuantVecList_mat, tumorSamps
  theCnvQuantVecList_mat_list[[i]] <- theCnvQuantVecList_mat
  tumorSamps_list[[i]] <- tumorSamps
  # Record the row names of this matrix. `[i] <- list(...)` keeps the slot even if they are NULL.
  gnamesList[i] <- list(rownames(theCnvQuantVecList_mat))
}

# cbind() keeps only the row names of the first matrix, so refuse to combine matrices whose
# rows are labelled differently rather than silently mislabelling them.
sameRowNames <- vapply(gnamesList, function(g) identical(g, gnamesList[[1]]), logical(1))
if(!all(sameRowNames))
{
  stop("CNV matrices do not share the same row names: ", paste(diseaseAbbrvs[!sameRowNames], collapse=", "))
}
bigCnvMat <- do.call(cbind, theCnvQuantVecList_mat_list)

#' Create a vector holding the cancer type of every column of bigCnvMat. I will use this below.
#' Each abbreviation is repeated once per column of the matching per-type CNV matrix.
canTypesVec <- rep(diseaseAbbrvs, times = vapply(theCnvQuantVecList_mat_list, ncol, integer(1)))
names(canTypesVec) <- colnames(bigCnvMat)

#' From bigCnvMat, keep the 01A samples (01A samples are the tumor samples) and drop repeated
#' patient ids, keeping the first 01A sample of each patient. Without this, a repeated id used as
#' a column name makes later name-based subsetting select the same first column several times.
barcodeFields <- strsplit(colnames(bigCnvMat), "-")
sampleCodes <- vapply(barcodeFields, function(a) a[4], character(1)) # 4th barcode field, e.g. "01A"
patientIds <- vapply(barcodeFields, function(a) a[3], character(1)) # 3rd barcode field is used as the patient id
bigCnv01A <- which(sampleCodes == "01A")
bigCnv01A <- bigCnv01A[!duplicated(patientIds[bigCnv01A])]
bigCnv01AMat <- bigCnvMat[, bigCnv01A, drop=FALSE] # drop=FALSE keeps a matrix even if only one sample remains
canTypesVec01A <- canTypesVec[bigCnv01A]
patBigCnv <- patientIds[bigCnv01A]
colnames(bigCnv01AMat) <- patBigCnv # change the column names to be the patient ids....
names(canTypesVec01A) <- patBigCnv


#' What effect does including cancer type as a co-variate have on the results?

#' Load the drug prediction data, keep its 01A samples, relabel them by patient id and then
#' subset both the CNV data and the predictions to the patients present in both.
load(file = paste0(theRootDir, "dataOut/allDrugPredictions_mat.RData")) # allDrugPredictions_mat, cancerTypesVec
colnames(allDrugPredictions_mat) <- chartr(".", "-", colnames(allDrugPredictions_mat))
all01ASamples <- colnames(allDrugPredictions_mat)[
  which(vapply(strsplit(colnames(allDrugPredictions_mat), "-"), function(a) a[4], character(1)) == "01A")
]
all01patientIds <- vapply(strsplit(all01ASamples, "-"), function(a) a[3], character(1))
allDrugPredictions_mat_01A <- allDrugPredictions_mat[, all01ASamples]
colnames(allDrugPredictions_mat_01A) <- all01patientIds

# Patients common to both the cnv and the prediction data, in the order of the cnv data.
commonPats <- patBigCnv[is.element(patBigCnv, all01patientIds)]
bigCnvMatCommonPats <- bigCnv01AMat[, commonPats]
canTypesVec01A_com <- canTypesVec01A[commonPats]
allDrugPredictions_mat_01A_com <- allDrugPredictions_mat_01A[, commonPats]

#' Run an analysis for ERBB2 amplification against drug sensitivity, with and without cancer type included as a factor.
#' Each model is fitted once per drug and both the p-value and the effect size are read from that fit.
erbb2Amp <- as.numeric(bigCnvMatCommonPats["ERBB2",] > 1)
numDrugs <- nrow(allDrugPredictions_mat_01A_com)
pVals <- rep(NA_real_, numDrugs)
betaVal <- rep(NA_real_, numDrugs)
pVals_withType <- rep(NA_real_, numDrugs)
betaVal_withType <- rep(NA_real_, numDrugs)
for(i in seq_len(numDrugs))
{
  # control for cancer type
  coefsWithType <- coef(summary(lm(allDrugPredictions_mat_01A_com[i,]~erbb2Amp+factor(canTypesVec01A_com))))
  # Look the term up by name: an aliased term is left out of the table, so row 2 could be another coefficient.
  if("erbb2Amp" %in% rownames(coefsWithType))
  {
    pVals_withType[i] <- coefsWithType["erbb2Amp", 4] # column 4 is the p-value
    betaVal_withType[i] <- coefsWithType["erbb2Amp", 1] # column 1 is the estimate
  }
  
  # don't control for cancer type
  coefsNoType <- coef(summary(lm(allDrugPredictions_mat_01A_com[i,]~erbb2Amp)))
  if("erbb2Amp" %in% rownames(coefsNoType))
  {
    pVals[i] <- coefsNoType["erbb2Amp", 4]
    betaVal[i] <- coefsNoType["erbb2Amp", 1]
  }
}
names(pVals) <- rownames(allDrugPredictions_mat_01A_com)
names(betaVal) <- rownames(allDrugPredictions_mat_01A_com)
names(pVals_withType) <- rownames(allDrugPredictions_mat_01A_com)
names(betaVal_withType) <- rownames(allDrugPredictions_mat_01A_com)

#' Print some of the top P-values for lapatinib and ERBB2, with and without controlling for cancer type. These are one tailed statistics as we are asking what is more likely to be effective in ERBB2 positive cancers.
psOne_withType <- makeOneTail(pvals = pVals_withType, betaVals = betaVal_withType, alternative = "less")
psOne <- makeOneTail(pvals = pVals, betaVals = betaVal, alternative = "less")
print(sort(psOne_withType)[seq_len(10)])
print(sort(psOne)[seq_len(20)])

# Histogram of the one-tailed p-values of all drugs without the cancer type correction; the red line marks lapatinib.
svg(filename = paste0(theRootDir, "figures/lapatinib_noCorrection.svg"), width = 4, height = 4)
hist(
  -log10(psOne),
  breaks = 100,
  col = "#8dd3c7",
  main = "Lapatinib",
  las = 1,
  xlab = expression("-Log"[10]*"P-value"),
  cex.axis = 0.75
)
abline(v = -log10(psOne["Lapatinib"]), col = "red")
dev.off()

# The same histogram for the p-values from the models that include cancer type.
svg(filename = paste0(theRootDir, "figures/lapatinib_corrForType.svg"), width = 4, height = 4)
hist(
  -log10(psOne_withType),
  breaks = 100,
  col = "#8dd3c7",
  main = "Lapatinib",
  las = 1,
  xlab = expression("-Log"[10]*"P-value"),
  cex.axis = 0.75
)
abline(v = -log10(psOne_withType["Lapatinib"]), col = "red")
dev.off()

#' Test every drug against EVERY CNV, while controlling for cancer type. I will parallelize this over 4 cores. NB, you may want to reduce that number based on the number of available cores.
#' First turn the CNV values into 0/1 indicator matrices: amplified (value > 1) and deleted (value < -1).
#' The comparison is vectorized over the whole matrix, which keeps the row and column names and
#' the matrix shape; multiplying by 1 converts TRUE/FALSE to numeric 1/0 (NA stays NA).
bigCnvMatCommonPats_amps <- (bigCnvMatCommonPats > 1) * 1
bigCnvMatCommonPats_dels <- (bigCnvMatCommonPats < -1) * 1
# Test the amplification status of one gene against the predicted response to every drug,
# controlling for cancer type.
#
# j: row index into bigCnvMatCommonPats_amps (the gene to test).
#
# Reads the globals bigCnvMatCommonPats_amps, allDrugPredictions_mat_01A_com and
# canTypesVec01A_com.
#
# Returns list(pVals, betaVal): the p-value and the estimate of the amplification term for each
# drug, named by the rows of allDrugPredictions_mat_01A_com. Both are empty numeric vectors when
# the gene is amplified in 50 or fewer samples. An entry is NA when the amplification term could
# not be estimated for that drug.
theFun <- function(j)
{
  pVals <- numeric()
  betaVal <- numeric()
  cnvAmp <- bigCnvMatCommonPats_amps[j, ]
  
  if(sum(cnvAmp, na.rm=TRUE) > 50) # make sure the gene is amplified in more than 50 samples
  {
    cancerType <- factor(canTypesVec01A_com) # the same for every drug, so build it once
    numDrugs <- nrow(allDrugPredictions_mat_01A_com)
    pVals <- rep(NA_real_, numDrugs)
    betaVal <- rep(NA_real_, numDrugs)
    for(i in seq_len(numDrugs))
    {
      theMod <- coef(summary(lm(allDrugPredictions_mat_01A_com[i,]~cnvAmp+cancerType)))
      # Look the term up by name: an aliased term is left out of the coefficient table, in
      # which case row 2 would belong to a cancer type coefficient instead.
      if("cnvAmp" %in% rownames(theMod))
      {
        pVals[i] <- theMod["cnvAmp", 4] # column 4 is the p-value
        betaVal[i] <- theMod["cnvAmp", 1] # column 1 is the estimate
      }
    }
    names(pVals) <- rownames(allDrugPredictions_mat_01A_com)
    names(betaVal) <- rownames(allDrugPredictions_mat_01A_com)
  }
  return(list(pVals, betaVal))
}

# Run theFun for every gene (row of bigCnvMatCommonPats) in parallel, then keep the results for
# the genes that passed the amplification count filter.
allCors <- mclapply(X = 1:nrow(bigCnvMatCommonPats), FUN = theFun, mc.cores = 4) # NB: You may want to change this number based on the number of available cores.
names(allCors) <- rownames(bigCnvMatCommonPats)
hasAmps <- rowSums(bigCnvMatCommonPats_amps, na.rm = TRUE) > 50 # restrict analysis to CNAs that occur in more than 50 samples.
allCors_hasAmps <- allCors[hasAmps]
pVals <- sapply(allCors_hasAmps, "[[", 1) # first element of each result: the p-values
betas <- sapply(allCors_hasAmps, "[[", 2) # second element of each result: the estimates

print(sort(unmatrix(pVals))[seq_len(10)]) # its going to be difficult to get at causality in a systematic way here....
print(pVals["Lapatinib", "ERBB2"])

#' Lets also incorporate glds. For every drug, compute the principal components of the predicted
#' response to its "negative control" drugs: drugs from a different category that are not among
#' the drugs most correlated with it.
drugRelatedness <- read.csv(paste(theRootDir, "dataIn/categorys.csv", sep=""), as.is=TRUE)
theseDrugNames <- rownames(allDrugPredictions_mat_01A_com)
# sub() always returns one name per row, unlike unlist(strsplit()) which changes length on empty or multi-part names.
drugRelatedness[, "theseDrugNames"] <- sub("_IC_50", "", drugRelatedness[, "DrugNamesOtherFile"], fixed=TRUE)
predsBySample <- t(allDrugPredictions_mat_01A_com) # samples in rows, drugs in columns
pairCor <- cor(predsBySample, method="spearman")
numCorrelatedToDrop <- 20 # how many of the most correlated other drugs to exclude from the controls
controlPcsList <- vector("list", length(theseDrugNames))
for(j in seq_along(theseDrugNames))
{
  thisDrug <- theseDrugNames[j]
  categoryThisDrug <- drugRelatedness[which(drugRelatedness[, "theseDrugNames"] == thisDrug), "Drug.Category"]
  negControlDrugs <- na.omit(drugRelatedness[!drugRelatedness[, "Drug.Category"] %in% categoryThisDrug, "theseDrugNames"])

  # Order the other drugs by absolute correlation with this drug. sort() is needed here:
  # rank() leaves the names in their original order, so indexing its result by position does
  # not select the most correlated drugs.
  absCor <- abs(pairCor[, thisDrug])
  absCor <- absCor[names(absCor) != thisDrug]
  pairwiseCorNear <- names(sort(absCor, decreasing=TRUE))[seq_len(min(numCorrelatedToDrop, length(absCor)))]

  # remove the drug itself and the very highly correlated drugs from the "negative controls",
  # and keep only drugs for which predictions exist
  negControlDrugs <- setdiff(negControlDrugs, c(thisDrug, pairwiseCorNear))
  negControlDrugs <- intersect(negControlDrugs, theseDrugNames)
  controlPCsAll <- prcomp(predsBySample[, negControlDrugs, drop=FALSE])$x
  controlPcsList[[j]] <- controlPCsAll
}

#' Run across all CNVs and all drugs with GLDS.....
# Test the amplification status of one gene against the predicted response to every drug,
# controlling for cancer type and for the first 50 control principal components of that drug.
#
# j: row index into bigCnvMatCommonPats_amps (the gene to test).
#
# Reads the globals bigCnvMatCommonPats_amps, allDrugPredictions_mat_01A_com,
# canTypesVec01A_com and controlPcsList (one matrix of principal components per drug, which
# must have at least 50 columns).
#
# Returns list(pVals, betaVal): the p-value and the estimate of the amplification term for each
# drug, named by the rows of allDrugPredictions_mat_01A_com. Both are empty numeric vectors when
# the gene is amplified in 50 or fewer samples. An entry is NA when the amplification term could
# not be estimated for that drug.
theFun <- function(j)
{
  pVals <- numeric()
  betaVal <- numeric()
  cnvAmp <- bigCnvMatCommonPats_amps[j, ]
  
  if(sum(cnvAmp, na.rm=TRUE) > 50) # make sure the gene is amplified in more than 50 samples
  {
    cancerType <- factor(canTypesVec01A_com) # the same for every drug, so build it once
    numDrugs <- nrow(allDrugPredictions_mat_01A_com)
    pVals <- rep(NA_real_, numDrugs)
    betaVal <- rep(NA_real_, numDrugs)
    for(i in seq_len(numDrugs))
    {
      controlPcs <- controlPcsList[[i]][, 1:50] # as with the mutation data, control for 50 pcs
      theMod <- coef(summary(lm(allDrugPredictions_mat_01A_com[i,]~cnvAmp+cancerType+controlPcs)))
      # Look the term up by name: an aliased term is left out of the coefficient table, in
      # which case row 2 would belong to another coefficient instead.
      if("cnvAmp" %in% rownames(theMod))
      {
        pVals[i] <- theMod["cnvAmp", 4] # column 4 is the p-value
        betaVal[i] <- theMod["cnvAmp", 1] # column 1 is the estimate
      }
    }
    names(pVals) <- rownames(allDrugPredictions_mat_01A_com)
    names(betaVal) <- rownames(allDrugPredictions_mat_01A_com)
  }
  return(list(pVals, betaVal))
}
# Run theFun for every gene (row of bigCnvMatCommonPats) in parallel, then keep the results for
# the genes that passed the amplification count filter.
allCors <- mclapply(seq_len(nrow(bigCnvMatCommonPats)), theFun, mc.cores=4) # NB: You may want to change this number based on the number of available cores.
names(allCors) <- rownames(bigCnvMatCommonPats)
hasAmps <- apply(bigCnvMatCommonPats_amps, 1, function(theRow)return(sum(theRow, na.rm=TRUE) > 50)) # restrict analysis to CNAs that occur in more than 50 samples.
allCors_hasAmps <- allCors[hasAmps]
pVals <- sapply(allCors_hasAmps, function(item)return(item[[1]]))
betas <- sapply(allCors_hasAmps, function(item)return(item[[2]]))

#' Print some of the top results and the results for lapatinib and ERBB2. Its going to be difficult to get at causality in a systematic way here. Something for future work.
print(sort(unmatrix(pVals))[1:10])
print(pVals["Lapatinib", "ERBB2"])










