#' In this script we will investigate the association between Drug sensitivity and CNV status in the TCGA breast cancer samples....

#' Set the root directory to the location of the data. This must be modified for your own use (based on the location of the files).
theRootDir <- "/mnt/data_scratch/finalData/"

#' First try this on breast cancer samples, predicting from allSolidTumors
library("pRRophetic")

#' Load the BRCA CNV data, the expression data and the drug predictions. The CNV data was created in "map_cnvs_to_genes.R", the expression data in "batch_correct_tcga_data.R" and the drug predictions in "getPredsOnAllTCGA_batchCorrData.R"
cnvFile <- paste0(theRootDir, "dataIn/tcga_cnv_subtracted/cnvsMappedToGenes/BRCA.RData")
exprFile <- paste0(theRootDir, "/dataIn/tenRuvNewStandardApproach.RData")
predFile <- paste0(theRootDir, "dataOut/allDrugPredictions_mat.RData")
load(cnvFile) # theCnvQuantVecList_mat, tumorSamps
load(file=exprFile) # cancerTypesVec, tenRuvNewStandardApproach
load(predFile) # allDrugPredictions_mat, cancerTypesVec

# Replace every "." in the prediction sample IDs with "-" so they can be split on "-" below.
colnames(allDrugPredictions_mat) <- gsub(".", "-", colnames(allDrugPredictions_mat), fixed=TRUE)


#' Load the breast cancer CNV data and find samples for which we also have drug sensitivity predictions.
diseaseAbbrvs <- c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "DLBC", "GBM", "HNSC", "KICH", "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")
tumSampCnvMat <- theCnvQuantVecList_mat[, tumorSamps]
predSampleIds <- colnames(allDrugPredictions_mat)
oLapSamps <- predSampleIds[predSampleIds %in% colnames(tumSampCnvMat)] # these do not match

#' Sample names do not match between the CNV and expression data, thus the IDs of the participants are matched instead.
# Split every prediction sample ID once; field 3 is used as the patient ID and field 4 as the sample code.
predIdFields <- strsplit(predSampleIds, "-")
patPred <- sapply(predIdFields, `[`, 3) # patient id of every drug prediction sample
pat01A <- which(sapply(predIdFields, `[`, 4) == "01A") # the "01A" samples: tumor samples, where the A means the first replicate (if there are replicates).
patCnv <- sapply(strsplit(colnames(tumSampCnvMat), "-"), `[`, 3) # patient IDs of the breast cancer CNV samples.
colnames(tumSampCnvMat) <- patCnv
patientsWithPreds <- intersect(patPred[pat01A], patPred[cancerTypesVec == "BRCA"])
relevantPatients <- patCnv[patCnv %in% patientsWithPreds] # 01A breast cancer patients for whom we have both CNV and drug predictions

#' Get the predictions for Lapatinib for only breast cancer patients, who are "01A" and have matched CNV data.
brca01aCols <- intersect(pat01A, which(cancerTypesVec == "BRCA"))
brca01aPreds <- allDrugPredictions_mat[, brca01aCols]
brca01aPreds_patNames <- sapply(strsplit(colnames(brca01aPreds), "-"), `[`, 3)
colnames(brca01aPreds) <- brca01aPreds_patNames
brca_preds_with_cnvs <- brca01aPreds["Lapatinib", relevantPatients]


#' Now run t-tests and Wilcoxon rank sum tests for every amplification, can the ERBB2 amplification be identified from these data?
nGenes <- nrow(tumSampCnvMat)

# One slot per gene, pre-filled with NA so that genes which are not tested keep a missing value.
pValsOut <- rep(NA_real_, nGenes)     # Spearman p-value against the continuous CNV value
pValsLm <- rep(NA_real_, nGenes)      # p-value of the gene's CNV in a linear model that also contains ERBB2 CNV
tTestAmp <- rep(NA_real_, nGenes)     # t-test p-value, amplified vs not amplified
tTestDir <- rep(NA_real_, nGenes)     # difference in means (amplified - not amplified)
pCondOnErbb2 <- rep(NA_real_, nGenes) # p-value of amplification status conditioned on ERBB2 status
wilcoxP <- rep(NA_real_, nGenes)      # Wilcoxon rank sum p-value
wilcoxDir <- rep(NA_real_, nGenes)    # difference in medians (amplified - not amplified)
nAmp <- numeric(nGenes)
nNotAmp <- numeric(nGenes)

# The ERBB2 profile does not change between iterations, so extract it once.
erbb2Cnv <- tumSampCnvMat["ERBB2", relevantPatients]

op <- options(warn = (-1)) # suppress warnings while the tests run; restored directly after the loop
for(i in seq_len(nGenes))
{
  geneCnv <- tumSampCnvMat[i, relevantPatients]
  amp <- which(geneCnv > 1)
  notAmp <- which(geneCnv < 1)
  nAmp[i] <- length(amp)
  nNotAmp[i] <- length(notAmp)

  # Require more than 10 amplified samples (as before) and at least two
  # non-amplified samples, because t.test() stops with fewer than two
  # observations in a group. Untested genes keep their NA values.
  if(length(amp) > 10 && length(notAmp) > 1)
  {
    # brca_preds_with_cnvs is already ordered like relevantPatients, so the
    # positional indices in amp / notAmp can be applied to it directly.
    ampPreds <- brca_preds_with_cnvs[amp]
    notAmpPreds <- brca_preds_with_cnvs[notAmp]

    pValsOut[i] <- cor.test(geneCnv, brca_preds_with_cnvs, method="spearman")$p.value # p-value against continuous CNV. This is likely nonsense.
    pValsLm[i] <- coef(summary(lm(brca_preds_with_cnvs ~ geneCnv + erbb2Cnv)))[2, 4]
    tTestAmp[i] <- t.test(ampPreds, notAmpPreds)$p.value # p-value for t-test between cnved and not cnved
    tTestDir[i] <- mean(ampPreds) - mean(notAmpPreds)
    wilcoxP[i] <- wilcox.test(ampPreds, notAmpPreds)$p.value
    wilcoxDir[i] <- median(ampPreds) - median(notAmpPreds)

    # Model on amplification status with ERBB2 status as a covariate. Samples
    # are stacked amplified-first; any sample whose ERBB2 value is not below 1
    # is labelled "ampE".
    resp <- c(ampPreds, notAmpPreds)
    explor <- rep(c("amp", "notAmp"), c(length(amp), length(notAmp)))
    erbb2NotAmp <- which(erbb2Cnv[c(amp, notAmp)] < 1)
    erbb2Amp <- rep("ampE", length(explor))
    erbb2Amp[erbb2NotAmp] <- "notAmpE"
    pCondOnErbb2[i] <- coef(summary(lm(resp ~ explor + erbb2Amp)))[2, 4]
  }
}
options(op) # restore the warning level that was active before the loop

#' Assign names and calculate q values...
geneIds <- rownames(tumSampCnvMat)
names(pValsOut) <- geneIds
names(pValsLm) <- geneIds
qTtest <- p.adjust(tTestAmp, method="BH")
names(tTestAmp) <- geneIds
names(tTestDir) <- geneIds

#' Create a table of some of the top results, we would expect ERBB2 to be near the top of this list.
# Positions of the 40 smallest t-test p-values; every column of the table is taken at these positions.
top40 <- order(tTestAmp)[1:40]
resMat <- cbind(PvalTests=tTestAmp[top40], betaTtest=tTestDir[top40], qvalTtest=qTtest[top40], numAmp=nAmp[top40], numNotAmp=nNotAmp[top40])
print(resMat)
genes <- rownames(resMat)

#' Show the top results if we use a Wilcoxon rank sum test instead: They are more or less the same.
names(wilcoxDir) <- geneIds
names(wilcoxP) <- geneIds
top20Wilcox <- order(wilcoxP)[1:20]
print(cbind(wilcoxP[top20Wilcox], wilcoxDir[top20Wilcox]))

#' We have also created a set of results when we condition on ERBB2. When we do this, we identify the secondary drug target EGFR. We also identify a number of ABC transporters, which are known to be involved in multidrug resistance....
names(pCondOnErbb2) <- geneIds
print(sort(pCondOnErbb2)[1:20])

#' Create a scatter plot of the predicted sensitivity Vs ERBB2 amplification.
erbb2CopyNum <- tumSampCnvMat["ERBB2", relevantPatients]
svg(paste0(theRootDir, "figures/erbb2VsLapatinib.svg"), width=4, height=4)
plot(erbb2CopyNum, brca_preds_with_cnvs, pch=20, col="#00000044", xlab="CNV", ylab="Predicted Lapatinib IC50")
dev.off()

#' Create a plot showing that predicted lapatinib sensitivity increases with increasing copy number of ERBB2
# Bin the copy number into [-Inf, 1), [1, 2), [2, 3) and [3, Inf). These are
# the same intervals the previous chain of threshold assignments produced
# (values of exactly 1, 2 or 3 fell into the upper bin). Because the result is
# a factor with four fixed levels, split() always returns four named groups,
# so labels and colours stay aligned even when a bin contains no samples
# (assigning four names to fewer groups used to be an error).
splitOnCnvNum <- cut(erbb2CopyNum, breaks=c(-Inf, 1, 2, 3, Inf), labels=c("<1", "1-2", "2-3", ">3"), right=FALSE)
a <- split(brca_preds_with_cnvs, splitOnCnvNum)
svg(paste0(theRootDir, "figures/erbb2VsLapatinib_bplot.svg"), width=3, height=4)
boxplot(a, col=c("#eff3ff", "#bdd7e7", "#6baed6", "#2171b5"), ylab="Predicted Lapatinib Sensitivity", xlab="Normalized Copy Number", pch=20, cex.axis=.75, outcol="#00000033")
dev.off()


#' Plot samples that are HER2 amplified or not amplified against PC1 and PC2 of the corresponding gene expression matrix. This is a supplementary figure.
# NOTE(review): the expression matrix is subset with column positions derived
# from allDrugPredictions_mat (pat01A, cancerTypesVec), i.e. this assumes both
# matrices hold the samples in the same column order - confirm.
brca01aExpr <- tenRuvNewStandardApproach[, intersect(pat01A, which(cancerTypesVec == "BRCA"))]
# The expression sample IDs are split on "."; the third field is used as the patient ID.
brca01aExpr_patNames <- sapply(strsplit(colnames(brca01aExpr), ".", fixed=TRUE), function(a)a[3])
colnames(brca01aExpr) <- brca01aExpr_patNames
brca_expr_with_cnvs <- brca01aExpr[, relevantPatients]

# Keep the whole prcomp object and take the score matrix from it. Previously
# `$x` was applied twice (once to the prcomp result and once more to the
# resulting matrix), which fails because `$` cannot be used on a matrix.
pcOut <- prcomp(t(brca_expr_with_cnvs))
thePcs <- pcOut$x
hasErbb2Amp <- as.numeric(tumSampCnvMat["ERBB2", relevantPatients] > 1)

#' Print the P-values for the association of ERBB2 amplification and of the PCs of the gene expression matrix.
# Test the first ten PCs, or fewer when the decomposition returns fewer columns.
for(i in seq_len(min(10, ncol(thePcs))))
{
  print(wilcox.test(thePcs[,i][which(hasErbb2Amp == 1)], thePcs[,i][which(hasErbb2Amp == 0)])$p.value)
}

#' Make a plot for the strongest associated PCs, that is PCs 4 and 5.
# One colour per sample: first colour for amplified, second for not amplified.
thecols <- ifelse(hasErbb2Amp == 1, "#377eb899", "#4daf4a99")
svg(paste0(theRootDir, "figures/erbb2AgainstExpressionPcs.svg"), width=3, height=4)
plot(thePcs[,4], thePcs[,5], col=thecols, pch=1, xlab="PC4", ylab="PC5", cex.axis=.7, las=1)
legend("bottomleft", inset=.05, title="HER2 Status", c("Amplified","Not Amplified"), fill=c("#377eb899", "#4daf4a99"), cex=0.5)
dev.off()


#' Plot the effect size in the region around the ERBB2 locus: Using this approach we can identify ERBB2 as the causative gene in this locus.
# First load the GRanges object that was used in the creation of the CNV -> gene mapping, i.e. TxDb.Hsapiens.UCSC.hg19.knownGene
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
library(GenomicFeatures)
geneRanges <- genes(txdb)

# Attach a gene symbol to every range by looking its gene_id up in the
# gene_id -> symbol table of org.Hs.eg.db; ids without an entry get NA.
library(org.Hs.eg.db)
e2s <- toTable(org.Hs.egSYMBOL)
syms <- setNames(e2s[, "symbol"], e2s[, "gene_id"])
rangeGeneIds <- as.character(mcols(geneRanges)$gene_id)
mcols(geneRanges)$gene_sym <- syms[rangeGeneIds]
save(geneRanges, file=paste0(theRootDir, "dataIn/geneRangesHg19.RData"))


# Gene locations as a data frame with one row per gene symbol.
theDf <- as.data.frame(geneRanges)
theDf_filt <- theDf[!is.na(theDf$gene_sym), ] # remove the rows with nas for gene symbols
theDf_filt <- theDf_filt[!duplicated(theDf_filt$gene_sym), ] # row names must be unique; keep the first range per symbol
rownames(theDf_filt) <- theDf_filt$gene_sym

# Locations of the genes in the top t-test table (`genes`). match() is used
# rather than character row indexing because data frames match row names
# partially, which could silently return the location of a different gene.
resLocs_ <- theDf_filt[match(genes, theDf_filt$gene_sym), c("gene_sym", "seqnames", "start", "end")]

# Restrict to the window around ERBB2: genes on chromosome 17 that start before
# 45 Mb. The chromosome test keeps genes from other chromosomes out of plots
# whose x axis is labelled as chromosome 17, and which() drops genes that had
# no location (their all-NA rows would otherwise be carried along and break
# the lookups into resMat below).
inErbb2Window <- which(resLocs_$seqnames == "chr17" & abs(resLocs_$start) < 4.5e7)
resLocs_2 <- resLocs_[inErbb2Window, ]
save(resLocs_2, file=paste0(theRootDir, "dataOut/resLocs_2.RData")) # I will also make this same plot in the GDSC cell line dataset...

# Plot coordinates and labels for the genes in the window.
startVec <- abs(resLocs_2$start)
endVec <- abs(resLocs_2$end)
midVec <- ((startVec + endVec) / 2)
textVec <- as.character(resLocs_2$gene_sym) # taken from the table directly so a label exists even when only one gene remains
yVec <- resMat[textVec, 2] # column 2 of resMat: effect size (difference in means)
pVec <- resMat[textVec, 1] # column 1 of resMat: t-test p-value
segmentsMat <- cbind(startVec, yVec, endVec, yVec)

# Gene coordinates in megabases, shared by both figures below.
startMb <- startVec / 1000000
endMb <- endVec / 1000000
midMb <- midVec / 1000000
capHalfHeight <- 0.0005 # half height of the small vertical caps at each gene end
labelOffset <- 0.0013   # vertical distance of the gene label above its effect size

# Figure 1: one horizontal segment per gene spanning its start to its end,
# drawn at the gene's effect size. The initial plot() uses white points and
# therefore only sets up the axes.
svg(paste0(theRootDir, "figures/erbb2LocSegs.svg"), width=5, height=4)
plot(startMb, yVec, xlab="Chromosome 17 Location (megabases)", ylab="Effect Size", las=1, pch=20, col="#ffffff", cex.axis=.7)
segments(startMb, yVec, endMb, yVec)
segments(startMb, yVec + capHalfHeight, startMb, yVec - capHalfHeight)
segments(endMb, yVec + capHalfHeight, endMb, yVec - capHalfHeight)
text(midMb, yVec + labelOffset, textVec, cex=.3)
# lines(xLine/1000000, yLine)
dev.off()

# Figure 2: the same information with a single point at the midpoint of each gene, this is probably a lot better.
svg(paste0(theRootDir, "figures/erbb2LocPoints.svg"), width=5, height=4)
plot(midMb, yVec, xlab="Chromosome 17 Location (megabases)", ylab="Effect Size", las=1, pch=20, col="#00000099", cex.axis=.7)
text(midMb, yVec + labelOffset, textVec, cex=.3)
dev.off()

# Overlay p-values on the plot above with ggplot2....
library(ggplot2)
dat <- data.frame(x=midVec/1000000, y=yVec, Drug=textVec, pVal=-log10(pVec))

# Version with a border on the points (shape 21 is coloured through `fill`).
# The colour gradient is therefore supplied as a *fill* scale; with a colour
# scale the gradient and legend title were not applied to the mapped
# aesthetic. This version is written to its own file: both versions used to
# share one file name, so this one was always overwritten by the next.
# print() is explicit so the figure is also drawn when the script is run
# through source(), where top-level values are not printed automatically.
svg(paste0(theRootDir, "figures/Erbb2Loc_points_bordered.svg"), width=6, height=4)
print(ggplot(data=dat, aes(x=x, y=y)) + theme_bw() + geom_point(aes(fill=pVal), size=I(3), pch=21) + geom_text(aes(label=Drug), vjust=-.5, hjust=-.24, size=2.5, angle=15) + ylab("Lapatinib Effect Size for HER2+ vs HER2-") + xlab("Chromosome 17 Location (megabases)") + scale_fill_continuous(low="steelblue4",high="tomato2", name="-Log10 P-value") + theme(legend.position=c(.9,.2)))
dev.off()

# Different version of this plot (with no borders on the points, possibly harder to see some points).
# This one keeps the original file name.
svg(paste0(theRootDir, "figures/Erbb2Loc_points.svg"), width=7, height=5)
print(ggplot(data=dat, aes(x=x, y=y)) + theme_bw() + geom_point(aes(color=pVal), size=I(3)) + geom_text(aes(label=Drug), vjust=-.5, hjust=-.24, size=2.5, angle=15) + ylab("Lapatinib Effect Size for HER2+ vs HER2-") + xlab("Chromosome 17 Location (megabases)") + scale_color_continuous(low="steelblue4",high="tomato2", name="-Log10 P-value") + theme(legend.position=c(.9,.2)))
dev.off()


#' Apply predictions across all CNVs and drugs for breast cancer samples. Can we identify any novel associations!?
# Drug identifiers used as row names of the prediction matrices. The order
# matters: doPredict(k, ...) looks a drug up as possibleDrugs[k], and the same
# vector is used to name the per-drug result lists.
possibleDrugs <- c("A.443654", "A.770041", "ABT.263", "ABT.888", "AG.014699", "AICAR", "AKT.inhibitor.VIII", "AMG.706", "AP.24534", "AS601245", "ATRA", "AUY922", "Axitinib", "AZ628", "AZD.0530", "AZD.2281", "AZD6244", "AZD6482", "AZD7762", "AZD8055", "BAY.61.3606", "Bexarotene", "BI.2536", "BIBW2992", "Bicalutamide", "BI.D1870", "BIRB.0796", "Bleomycin", "BMS.509744", "BMS.536924", "BMS.708163", "BMS.754807", "Bortezomib", "Bosutinib", "Bryostatin.1", "BX.795", "Camptothecin", "CCT007093", "CCT018159", "CEP.701", "CGP.082996", "CGP.60474", "CHIR.99021", "CI.1040", "Cisplatin", "CMK", "Cyclopamine", "Cytarabine", "Dasatinib", "DMOG", "Docetaxel", "Doxorubicin", "EHT.1864", "Elesclomol", "Embelin", "Epothilone.B", "Erlotinib", "Etoposide", "FH535", "FTI.277", "GDC.0449", "GDC0941", "Gefitinib", "Gemcitabine", "GNF.2", "GSK269962A", "GSK.650394", "GW.441756", "GW843682X", "Imatinib", "IPA.3", "JNJ.26854165", "JNK.9L", "JNK.Inhibitor.VIII", "JW.7.52.1", "KIN001.135", "KU.55933", "Lapatinib", "Lenalidomide", "LFM.A13", "Metformin", "Methotrexate", "MG.132", "Midostaurin", "Mitomycin.C", "MK.2206", "MS.275", "Nilotinib", "NSC.87877", "NU.7441", "Nutlin.3a", "NVP.BEZ235", "NVP.TAE684", "Obatoclax.Mesylate", "OSI.906", "PAC.1", "Paclitaxel", "Parthenolide", "Pazopanib", "PD.0325901", "PD.0332991", "PD.173074", "PF.02341066", "PF.4708671", "PF.562271", "PHA.665752", "PLX4720", "Pyrimethamine", "QS11", "Rapamycin", "RDEA119", "RO.3306", "Roscovitine", "Salubrinal", "SB.216763", "SB590885", "Shikonin", "SL.0101.1", "Sorafenib", "S.Trityl.L.cysteine", "Sunitinib", "Temsirolimus", "Thapsigargin", "Tipifarnib", "TW.37", "Vinblastine", "Vinorelbine", "Vorinostat", "VX.680", "VX.702", "WH.4.023", "WO2009093972", "WZ.1.84", "X17.AAG", "X681640", "XMD8.85", "Z.LLNle.CHO", "ZM.447439")
# Gene identifiers: the row names of the gene-level CNV matrix.
theGeneNames <- rownames(theCnvQuantVecList_mat)
# Test, for one drug, every gene's amplification status against the predicted
# drug sensitivity of the "01A" breast cancer samples that have CNV data.
#
# Arguments:
#   k                   index into the global vector `possibleDrugs`.
#   theDrugPredictions  matrix of predictions with drug names as row names and
#                       "-"-separated sample IDs as column names.
#
# Reads the globals `possibleDrugs`, `theCnvQuantVecList_mat`, `tumorSamps`
# and `cancerTypesVec`, and additionally `allDrugPredictions_mat` when
# `theDrugPredictions` has a different number of samples than `cancerTypesVec`.
#
# Returns a numeric matrix with one row per gene of the CNV matrix (row names
# are the gene names) and the columns tTestAmp, tTestDir, wilcoxP, wilcoxDir,
# numAmpVec and numNotAmpVec. Genes that are not tested (5 or fewer amplified
# samples, or fewer than 2 non-amplified samples) are NA in every column.
# The number of rows is no longer tied to a hard-coded gene count, so the
# function works for a CNV matrix of any size.
doPredict <- function(k, theDrugPredictions)
{
  predOnAll_residualized <- theDrugPredictions[possibleDrugs[k], ]
  sampleIds <- names(predOnAll_residualized)

  # Cancer type of every sample in theDrugPredictions. cancerTypesVec is used
  # as is when it has one entry per sample. Otherwise (e.g. a matrix holding
  # only a subset of the samples) positions would not line up, so the types
  # are looked up by sample ID.
  # NOTE(review): the lookup assumes cancerTypesVec is ordered like the
  # columns of allDrugPredictions_mat, as the rest of the script does - confirm.
  if(length(cancerTypesVec) == length(sampleIds))
  {
    sampleTypes <- cancerTypesVec
  }
  else
  {
    sampleTypes <- cancerTypesVec[match(sampleIds, colnames(allDrugPredictions_mat))]
  }
  brcaSamples <- which(sampleTypes == "BRCA")

  # get the cnv data for breast cancer....
  tumSampCnvMat <- theCnvQuantVecList_mat[, tumorSamps]

  # Match participants in both datasets: field 3 of the sample ID is used as
  # the patient ID and field 4 as the sample code.
  sampleFields <- strsplit(sampleIds, "-")
  patPred <- vapply(sampleFields, function(a) a[3], character(1))
  pat01A <- which(vapply(sampleFields, function(a) a[4], character(1)) == "01A")
  patCnv <- vapply(strsplit(colnames(tumSampCnvMat), "-"), function(a) a[3], character(1))
  colnames(tumSampCnvMat) <- patCnv

  relevantPatients <- patCnv[patCnv %in% intersect(patPred[pat01A], patPred[brcaSamples])] # 01A breast cancer patients

  brca01aPreds <- predOnAll_residualized[intersect(pat01A, brcaSamples)]
  names(brca01aPreds) <- vapply(strsplit(names(brca01aPreds), "-"), function(a) a[3], character(1))

  # Predictions in the same order as the CNV columns selected by relevantPatients.
  brca_preds_with_cnvs <- brca01aPreds[relevantPatients]

  # One slot per gene, NA until the gene is actually tested.
  nGenes <- nrow(tumSampCnvMat)
  tTestAmp <- rep(NA_real_, nGenes)
  tTestDir <- rep(NA_real_, nGenes)
  wilcoxP <- rep(NA_real_, nGenes)
  wilcoxDir <- rep(NA_real_, nGenes)
  numAmpVec <- rep(NA_real_, nGenes)
  numNotAmpVec <- rep(NA_real_, nGenes)
  for(i in seq_len(nGenes))
  {
    geneCnv <- tumSampCnvMat[i, relevantPatients]
    amp <- which(geneCnv > 1)
    notAmp <- which(geneCnv < 1)

    # t.test() stops with fewer than two observations in a group, so also
    # require at least two non-amplified samples.
    if(length(amp) > 5 && length(notAmp) > 1)
    {
      ampPreds <- brca_preds_with_cnvs[amp]
      notAmpPreds <- brca_preds_with_cnvs[notAmp]
      tTestAmp[i] <- t.test(ampPreds, notAmpPreds)$p.value # p-value for t-test between cnved and not cnved
      tTestDir[i] <- mean(ampPreds) - mean(notAmpPreds)
      wilcoxP[i] <- wilcox.test(ampPreds, notAmpPreds)$p.value
      wilcoxDir[i] <- median(ampPreds) - median(notAmpPreds)
      numAmpVec[i] <- length(amp)
      numNotAmpVec[i] <- length(notAmp)
    }
  }

  outMat <- cbind(tTestAmp=tTestAmp, tTestDir=tTestDir, wilcoxP=wilcoxP, wilcoxDir=wilcoxDir, numAmpVec=numAmpVec, numNotAmpVec=numNotAmpVec)
  rownames(outMat) <- rownames(tumSampCnvMat)
  print(k) # progress indicator
  return(outMat)
}
library("parallel")
# Run doPredict() for every entry of possibleDrugs (previously a hard-coded 1:138).
allBrcaCnaAssocs <- mclapply(seq_along(possibleDrugs), doPredict, allDrugPredictions_mat, mc.cores=12)
names(allBrcaCnaAssocs) <- possibleDrugs

# mclapply() does not stop when a worker fails; the affected element is then
# not a result matrix. Stop here with the names of the affected drugs instead
# of failing later with an indexing error.
failedDrugs <- possibleDrugs[!vapply(allBrcaCnaAssocs, is.matrix, logical(1))]
if(length(failedDrugs) > 0)
{
  stop("doPredict() did not return a result matrix for: ", paste(failedDrugs, collapse=", "))
}

#' Find the best association for each drug, (these are Wilcoxon rank sum test p-values......)
# For one result matrix, return the value of `column` for the gene with the
# smallest Wilcoxon p-value. The value keeps the gene name as its name, which
# is how the gene is recovered from these lists later on.
topByWilcox <- function(assocMat, column)
{
  best <- order(assocMat[, "wilcoxP"])[1]
  assocMat[, column][best]
}
minGenes <- lapply(allBrcaCnaAssocs, topByWilcox, column="wilcoxP")           # smallest p-value per drug
minGenesDiff <- lapply(allBrcaCnaAssocs, topByWilcox, column="wilcoxDir")     # matching difference in medians
ampNumList <- lapply(allBrcaCnaAssocs, topByWilcox, column="numAmpVec")       # number of amplified samples
notAmpNumList <- lapply(allBrcaCnaAssocs, topByWilcox, column="numNotAmpVec") # number of non-amplified samples

#' Print the associations in a positive direction (i.e. amplification predictive of resistance)
print(sort(unlist(minGenes[minGenesDiff > 0])))

#' Print the associations in a negative direction, i.e. amplification predictive of drug sensitivity.
print(sort(unlist(minGenes[minGenesDiff < 0])))

#' Make a table of the top association for every drug, with drug, gene, chromosome, chromosome location, P-value, effect size, number of amplified samples, number of not amplified samples.
# The gene of each top association is stored as the name of the p-value.
genes <- sapply(minGenes, names)
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)
columns(org.Hs.eg.db)

# Look the gene symbols up in org.Hs.eg.db and keep the first returned row per symbol.
annotationColumns <- c("SYMBOL","CHR","CHRLOC","CHRLOCEND")
resLocs <- select(org.Hs.eg.db, keys=genes, columns=annotationColumns, keytype="SYMBOL")
firstRowPerSymbol <- !duplicated(resLocs[, "SYMBOL"])
resNodups <- resLocs[firstRowPerSymbol, ]
rownames(resNodups) <- resNodups[, 1]
chr <- resNodups[genes, 2]
chrLoc <- resNodups[genes, 3]

# One row per drug, written out sorted by p-value.
df <- data.frame(drug=names(minGenes), genes=genes, chr=chr, chrLoc=chrLoc, pVals=unlist(minGenes), beta=unlist(minGenesDiff), numAmp=unlist(ampNumList), numNotAmp=unlist(notAmpNumList))
byPvalue <- order(df[, "pVals"])
dfOrd <- df[byPvalue, ]
write.table(dfOrd, file=paste0(theRootDir, "tables/top_cnv_drug_assocs_brca.txt"), row.names=FALSE)

#' Make the plot of Effect-size / P-values for Vinorelbine in the ERLIN2 locus.
# The 40 genes with the smallest Wilcoxon p-value (column 3) for Vinorelbine.
vbineAssocs <- allBrcaCnaAssocs[["Vinorelbine"]]
topAssocsVbine <- vbineAssocs[order(vbineAssocs[, 3]), ][1:40, ]

# Gene locations, matched on the exact gene symbol. Character row indexing of
# a data frame matches row names partially and could silently return another
# gene, so match() is used instead.
resLocs_vBine <- theDf_filt[match(rownames(topAssocsVbine), rownames(theDf_filt)), c("gene_sym", "seqnames", "start", "end")]
# select out the genes in the region of LOC728024 on chromosome 8, i.e. the region of the strongest association...
# which() drops the genes for which no location was found.
topLocusGenes_vBine <- rownames(resLocs_vBine)[which(resLocs_vBine[, "seqnames"] == "chr8")]
resLocs_vBine_2 <- resLocs_vBine[topLocusGenes_vBine, ]
save(resLocs_vBine_2, file=paste0(theRootDir, "dataOut/resLocs_vBine_2.RData"))

#' Overlay p-values on the plot above with ggplot2....
midVec <- ((resLocs_vBine_2$end + resLocs_vBine_2$start) / 2)
yVec <- topAssocsVbine[topLocusGenes_vBine, 2] # column 2: difference in means
textVec <- topLocusGenes_vBine
pVec <- topAssocsVbine[topLocusGenes_vBine, 1] # column 1: t-test p-value
dat <- data.frame(x=midVec/1000000, y=yVec, Drug=textVec, pVal=-log10(pVec))
svg(paste0(theRootDir, "figures/Erlin2Loc_points.svg"), width=7, height=5)
# print() is explicit so the figure is also drawn when the script is run
# through source(), where top-level values are not printed automatically.
print(ggplot(data=dat, aes(x=x, y=y)) + theme_bw() + geom_point(aes(color=pVal), size=I(3)) + geom_text(aes(label=Drug), vjust=-.5, hjust=-.24, size=2.5, angle=15) + ylab("Vinorelbine Effect Size for ERLIN2 Amplified vs Not Amplified") + xlab("Chromosome 8 Location (megabases)") + scale_color_continuous(low="steelblue4",high="tomato2", name="-Log10 P-value") + theme(legend.position=c(.9,.8)))
dev.off()

#' Create a scatter plot of the predicted vinorelbine sensitivity Vs ERLIN2 amplification.
# Rebuild the patient matching for the Vinorelbine predictions: field 3 of the
# sample ID is used as the patient ID and field 4 as the sample code.
predOnAll_residualized <- allDrugPredictions_mat["Vinorelbine", ]
tumSampCnvMat <- theCnvQuantVecList_mat[, tumorSamps]
oLapSamps <- names(predOnAll_residualized)[names(predOnAll_residualized) %in% colnames(tumSampCnvMat)]
patPred <- sapply(strsplit(names(predOnAll_residualized), "-"), function(a)a[3])
pat01A <- which(sapply(strsplit(names(predOnAll_residualized), "-"), function(a)a[4]) == "01A")
patCnv <- sapply(strsplit(colnames(tumSampCnvMat), "-"), function(a)a[3])
colnames(tumSampCnvMat) <- patCnv
relevantPatients <- patCnv[patCnv %in% intersect(patPred[pat01A], patPred[cancerTypesVec == "BRCA"])] # 01A breast cancer patients
brca01aPreds <- predOnAll_residualized[intersect(pat01A, which(cancerTypesVec == "BRCA"))]
brca01aPreds_patNames <- sapply(strsplit(names(brca01aPreds), "-"), function(a)a[3])
names(brca01aPreds) <- brca01aPreds_patNames
brca_vbine_preds_with_cnvs <- brca01aPreds[relevantPatients]
erlin2CopyNum <- tumSampCnvMat["ERLIN2", relevantPatients]
svg(paste0(theRootDir, "figures/erlin2vsVinorelbine.svg"), width=4, height=4)
plot(erlin2CopyNum, brca_vbine_preds_with_cnvs, pch=20, col="#00000044", xlab="ERLIN2 CNV", ylab="Predicted Vinorelbine IC50")
dev.off()

#' Create a boxplot stratifying by CNV for vinorelbine and ERLIN2
# Bin the copy number into [-Inf, 1), [1, 2), [2, 3) and [3, Inf). These are
# the same intervals the previous chain of threshold assignments produced.
# Because the result is a factor with four fixed levels, split() always
# returns four named groups, so labels and colours stay aligned even when a
# bin contains no samples (assigning four names to fewer groups used to be an
# error).
splitOnCnvNum <- cut(erlin2CopyNum, breaks=c(-Inf, 1, 2, 3, Inf), labels=c("<1", "1-2", "2-3", ">3"), right=FALSE)
a <- split(brca_vbine_preds_with_cnvs, splitOnCnvNum)
svg(paste0(theRootDir, "figures/erlin2VsVinorelbine_bplot.svg"), width=3, height=4)
boxplot(a, col=c("#eff3ff", "#bdd7e7", "#6baed6", "#2171b5"), ylab="Predicted Vinorelbine Sensitivity", xlab="Normalized Copy Number", pch=20, cex.axis=.75, outcol="#00000033")
dev.off()



#' # Do the CNV analysis on the drug predictions when models were applied to only breast cancer data instead of all of TCGA.
load(file=paste0(theRootDir, "dataOut/brcaDrugPredsAll.RData")) # allSizesOut contains the breast cancer specific predictions (correlate with cancer type )
names(allSizesOut) <- possibleDrugs
brcaDrugMat <- do.call(rbind, allSizesOut)
colnames(brcaDrugMat) <- gsub(".", "-", colnames(brcaDrugMat), fixed=TRUE)
# Run doPredict() for every entry of possibleDrugs (previously a hard-coded 1:138).
allBrcaCnaAssocs_onlyBrcaPreds <- mclapply(seq_along(possibleDrugs), doPredict, brcaDrugMat, mc.cores=12)
names(allBrcaCnaAssocs_onlyBrcaPreds) <- possibleDrugs

# mclapply() does not stop when a worker fails; the affected element is then
# not a result matrix. Stop here with the names of the affected drugs instead
# of failing later with an indexing error.
failedDrugs <- possibleDrugs[!vapply(allBrcaCnaAssocs_onlyBrcaPreds, is.matrix, logical(1))]
if(length(failedDrugs) > 0)
{
  stop("doPredict() did not return a result matrix for: ", paste(failedDrugs, collapse=", "))
}

#' Find the best association for each drug, (these are Wilcoxon rank sum test p-values......)
# For one result matrix, return the value of `column` for the gene with the
# smallest Wilcoxon p-value; the value keeps the gene name as its name.
topByWilcox <- function(assocMat, column)
{
  best <- order(assocMat[, "wilcoxP"])[1]
  assocMat[, column][best]
}
minGenes <- lapply(allBrcaCnaAssocs_onlyBrcaPreds, topByWilcox, column="wilcoxP")           # smallest p-value per drug
minGenesDiff <- lapply(allBrcaCnaAssocs_onlyBrcaPreds, topByWilcox, column="wilcoxDir")     # matching difference in medians
ampNumList <- lapply(allBrcaCnaAssocs_onlyBrcaPreds, topByWilcox, column="numAmpVec")       # number of amplified samples
notAmpNumList <- lapply(allBrcaCnaAssocs_onlyBrcaPreds, topByWilcox, column="numNotAmpVec") # number of non-amplified samples

#' Print the associations in a positive direction (i.e. amplification predictive of resistance)
print(sort(unlist(minGenes[minGenesDiff > 0])))

#' Print the associations in a negative direction, i.e. amplification predictive of drug sensitivity.
print(sort(unlist(minGenes[minGenesDiff < 0])))

#' What do the correlations for these drug predictions actually look like across the dataset? They are quite highly correlated.
# Select the same drugs (rows) and samples (columns) as brcaDrugMat *by name*,
# so that row i of both matrices is guaranteed to be the same drug; previously
# the rows were paired by position only.
allDrugPredictions_mat_brca <- allDrugPredictions_mat[rownames(brcaDrugMat), colnames(brcaDrugMat)]
drugRows <- seq_len(nrow(brcaDrugMat))
# Spearman correlation and its p-value between the two prediction sets, per drug.
theCors <- vapply(drugRows, function(i) cor(allDrugPredictions_mat_brca[i,], brcaDrugMat[i,], method="spearman"), numeric(1))
theCorPs <- vapply(drugRows, function(i) cor.test(allDrugPredictions_mat_brca[i,], brcaDrugMat[i,], method="spearman")$p.value, numeric(1))
names(theCors) <- rownames(allDrugPredictions_mat_brca)
names(theCorPs) <- rownames(allDrugPredictions_mat_brca)

#' The median correlations are high and highly significant....
print(median(theCors))
print(median(theCorPs))

#' Create a histogram of these correlations
svg(paste0(theRootDir, "figures/allOrBcHist.svg"), width=4, height=4)
hist(theCors, col="black", xlab="Spearman Correlation", main="", las=1, cex.axis=.75, breaks=20)
dev.off()


#' Session Info
print(sessionInfo())





