#' This script performs the IDWAS analysis on the TCGA breast cancer data, i.e. it demonstrates that the predictive models recapitulate what would be expected in terms of drug response. This script creates much of Figure 2 and the results and tables that are related to Figure 2.

#' Set the root directory to the location of the data. This must be modified for your own use (based on the location of the files).
#' Every input and output path in this script is built by pasting a relative path directly onto this string, so keep the trailing "/".
theRootDir <- "/mnt/data_scratch/finalData/"

#' NB We are using 10 cores for this by default; you may want to adjust this based on the number of cores available on the machine on which you are running this code.
#' This value is passed as mc.cores to mclapply() when predictions are generated for every drug.
nCores <- 10

#' Load the tpm data for breast cancer RNA-seq data. Log transform.
brcaDataLoc <- paste0(theRootDir, "dataIn/rnaSeq/gdac.broadinstitute.org_BRCA.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.Level_3.2015082100.0.0/BRCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt")
tpmDatMat_bc <- read.delim(brcaDataLoc, as.is=TRUE)

# The first data row labels the measurement held in each column. Keep only the
# "scaled_estimate" columns, drop that label row, and convert to numeric once.
scaledEstimateCols <- which(tpmDatMat_bc[1,] == "scaled_estimate")
tpmDatMat_bc_tpm <- tpmDatMat_bc[-1, scaledEstimateCols, drop=FALSE]
tpmDatMat_bc_tpm <- apply(tpmDatMat_bc_tpm, 2, as.numeric)

# Gene identifiers look like "<symbol>|<id>"; keep the part before the "|" and
# skip the first entry, which belongs to the label row removed above.
geneNames <- vapply(strsplit(tpmDatMat_bc[, "Hybridization.REF"], "|", fixed=TRUE), function(idParts) idParts[1], character(1))[-1]
rownames(tpmDatMat_bc_tpm) <- geneNames

# Keep the first 28 characters of each column name (presumably the TCGA barcode
# without any suffix added when the file was read -- TODO confirm).
colnames(tpmDatMat_bc_tpm) <- substr(colnames(tpmDatMat_bc_tpm), 1, 28)

# Scale by 1e6 (presumably converting the scaled estimates to TPM, going by the
# variable names) and take log(x + 1) so that zero values stay finite.
tpmDatMat_bc_tpm_logged <- log((tpmDatMat_bc_tpm*1000000)+1)

#' Load the pRRophetic library.
library(pRRophetic)

#' Predict Lapatinib sensitivity in all breast cancer samples (tumor and non-tumor columns alike; filtering happens later).
bcaPreds <- pRRopheticPredict(
  tpmDatMat_bc_tpm_logged,
  "Lapatinib",
  tissueType="allSolidTumors",
  batchCorrect="standardize",
  selection=1,
  removeLowVaryingGenes=0.2,
  removeLowVaringGenesFrom="rawData"
)

#' Load the matched clinical data for BRCA. This file contains the HER2 status, as measured by immunohistochemistry (IHC).
clinicalDataLocation <- paste0(theRootDir, "dataIn/clinical/nationwidechildrens.org_clinical_patient_brca.txt")
clinDataBrca <- read.delim(clinicalDataLocation, as.is=TRUE)

# IHC HER2 calls, named by patient barcode so they can be looked up per patient.
her2status <- setNames(clinDataBrca[, "her2_status_by_ihc"], clinDataBrca[, "bcr_patient_barcode"])

#' Identify the HER2+, HER2- and HER2 Equivocal samples. Only tumor samples are used: predictions for normal samples are meaningless for this comparison and are removed first.
sampleNames <- colnames(tpmDatMat_bc_tpm_logged)
theTumorSamples <- which(substring(sampleNames, 14, 16) == "01A") # identify the tumor samples, tumor samples annotated as "01" by TCGA, normal samples as "10" (this also requires the vial letter "A").
newNames <- gsub(".", "-", substring(sampleNames, 1, 12), fixed=TRUE) # first 12 characters, with "." replaced by "-", to match the clinical patient barcodes.
names(bcaPreds) <- newNames
bcaPreds <- bcaPreds[theTumorSamples] # Only include the tumor samples in this analysis. Results on normal samples are meaningless.

# Match the clinical patients against the predictions that survived the tumor
# filter. Matching against every sample name would keep patients who have no
# retained tumor sample; they would contribute NA predictions and inflate the
# group sizes reported below.
sampsInBothDatasets <- clinDataBrca[, "bcr_patient_barcode"][clinDataBrca[, "bcr_patient_barcode"] %in% names(bcaPreds)]
her2Neg <- which(her2status[sampsInBothDatasets] == "Negative")
her2Pos <- which(her2status[sampsInBothDatasets] == "Positive")
her2Equiv <- which(her2status[sampsInBothDatasets] == "Equivocal")

#' Calculate the difference in predicted lapatinib sensitivity between the HER2+ and HER2- groups.
print(wilcox.test(
  bcaPreds[sampsInBothDatasets][her2Neg],
  bcaPreds[sampsInBothDatasets][her2Pos]
))
print(t.test(
  bcaPreds[sampsInBothDatasets][her2Neg],
  bcaPreds[sampsInBothDatasets][her2Pos]
))

#' Plot the difference in predicted lapatinib sensitivity between the HER2 +/-/Equivocal groups.
svg(paste0(theRootDir, "figures/BRCA_boxplot_clinical.svg"), width=3, height=4)
predsByIhcGroup <- list(
  Negative=bcaPreds[sampsInBothDatasets][her2Neg],
  Equivocal=bcaPreds[sampsInBothDatasets][her2Equiv],
  Positive=bcaPreds[sampsInBothDatasets][her2Pos]
)
boxplot(
  predsByIhcGroup,
  ylab="Predicted Lapatinib Sensitivity",
  col=c("#66c2a5", "#fc8d62", "#8da0cb"),
  width=c(.75, .75, .75),
  las=1,
  pch=20,
  cex.axis=.75,
  outcol="#00000033"
)
dev.off()

#' Print the numbers of samples in each of the 3 groups.
length(her2Neg)
length(her2Equiv)
length(her2Pos)


#' Repeat the HER2+ vs HER2- comparison using the FISH calls (the her2_fish_status column of the clinical file).
her2status_fish <- setNames(clinDataBrca[, "her2_fish_status"], clinDataBrca[, "bcr_patient_barcode"])

# FISH calls for the patients that also have a prediction.
fishCallsInBoth <- her2status_fish[sampsInBothDatasets]
her2Neg_fish <- which(fishCallsInBoth == "Negative")
her2Pos_fish <- which(fishCallsInBoth == "Positive")

# Group sizes.
length(her2Neg_fish)
length(her2Pos_fish)

print(wilcox.test(
  bcaPreds[sampsInBothDatasets][her2Neg_fish],
  bcaPreds[sampsInBothDatasets][her2Pos_fish]
))
print(t.test(
  bcaPreds[sampsInBothDatasets][her2Neg_fish],
  bcaPreds[sampsInBothDatasets][her2Pos_fish]
))

#' Write out a table of the patients who received lapatinib/herceptin/trastuzumab. Start by loading the merged clinical file.
#' Rows of this file are clinical fields (named by its first column) and columns are patients.
brcaClinData <- read.delim(
  paste0(theRootDir, "dataIn/clinical/gdac.broadinstitute.org_BRCA.Merge_Clinical.Level_1.2016012800.0.0/BRCA.clin.merged.txt"),
  as.is=TRUE,
  na.strings="somethingthatsnotNA"
)
rownames(brcaClinData) <- brcaClinData[,1]
brcaClinData_filt <- brcaClinData[, -1]

#' There are 23 rows containing info on drugs taken: "patient.drugs.drug.drug_name" and "patient.drugs.drug-2.drug_name"..."patient.drugs.drug-23.drug_name". TCGA IDs are in the row patient.bcr_patient_barcode.
#' Scan these 23 rows and identify patients who received at least one HER2 targeted therapy ("trastuzumab", "lapatinib" or "herceptin"). Note trastuzumab and herceptin are the same drug, but both of these terms have been used in this data.
theHer2Drugs <- c("trastuzumab", "lapatinib", "herceptin")
drugMatrixRowNames <- c("patient.drugs.drug.drug_name", paste0("patient.drugs.drug-", 2:23, ".drug_name"))

# Column (patient) indices with a HER2 drug in any of the drug rows; a patient
# can appear more than once here, so duplicates are removed when the IDs are built.
patientsOnHerTherapy <- unlist(lapply(
  drugMatrixRowNames,
  function(drugRow) which(brcaClinData_filt[drugRow, ] %in% theHer2Drugs)
))

# Characters 9-12 of the barcode, upper-cased, are used as the patient ID.
tcgaIdsOnHer2Therapy <- toupper(substring(brcaClinData_filt["patient.bcr_patient_barcode", ], 9, 12))[unique(patientsOnHerTherapy)]
length(tcgaIdsOnHer2Therapy)

# Copy of the predictions keyed by the same 4-character patient ID.
bcaPreds_newNames <- bcaPreds
names(bcaPreds_newNames) <- substring(names(bcaPreds), 9, 12)

#' Test the difference in predicted lapatinib response between patients who received a HER2 targeted therapy and those who did not.
t.test(
  bcaPreds_newNames[tcgaIdsOnHer2Therapy],
  bcaPreds_newNames[!(names(bcaPreds_newNames) %in% tcgaIdsOnHer2Therapy)]
)

#' Create a table with the drug info, including a row indicating whether the patient was annotated as having received a HER2 therapy.
drugMatrix <- brcaClinData_filt[drugMatrixRowNames, ]
colnames(drugMatrix) <- toupper(substring(brcaClinData_filt["patient.bcr_patient_barcode", ], 9, 12))

# "YES" for every patient column whose ID is in the HER2-therapy list, "NO" otherwise.
patientAnnotatedWithHer2Drug <- ifelse(colnames(drugMatrix) %in% tcgaIdsOnHer2Therapy, "YES", "NO")
names(patientAnnotatedWithHer2Drug) <- colnames(drugMatrix)

# Put the annotation row on top of the drug rows and give the rows readable labels.
drugMatrixFin <- rbind(patientAnnotatedWithHer2Drug, drugMatrix)
rownames(drugMatrixFin) <- c("Pateint annotated with HER2 therapy", paste("Drug", seq_len(nrow(drugMatrix))))
write.csv(drugMatrixFin, file=paste0(theRootDir, "dataOut/brcaPatientDrugInfo.csv"))

#' Build a table with patient ID, predicted lapatinib response, HER2 status by IHC, HER2 status by FISH and a "YES" if they received a HER2 targeted therapy.
# This information is contained in the following vectors: bcaPreds, her2status, her2status_fish, patientAnnotatedWithHer2Drug.
# Re-key the first three by characters 9-12 of their names so that all four share the same IDs.
# NOTE: this overwrites the names of bcaPreds, her2status and her2status_fish in place.
names(bcaPreds) <- substr(names(bcaPreds), 9, 12)
names(her2status) <- substr(names(her2status), 9, 12)
names(her2status_fish) <- substr(names(her2status_fish), 9, 12)

# Patients present in all four vectors.
samplesWithAll <- Reduce(intersect, list(
  names(bcaPreds),
  names(her2status),
  names(her2status_fish),
  names(patientAnnotatedWithHer2Drug)
))

outInfoSuppTabOne <- cbind(
  TCGA_Sample_ID=samplesWithAll,
  Imputed_Lapatinib_Response=bcaPreds[samplesWithAll],
  HER_Status_by_IHC=her2status[samplesWithAll],
  Her2_Status_by_FISH=her2status_fish[samplesWithAll],
  Patient_Rx_Her2_Therapy=patientAnnotatedWithHer2Drug[samplesWithAll]
)
write.csv(outInfoSuppTabOne, file=paste0(theRootDir, "dataOut/brcaPatientInfo_suppTab1.csv"))


#' We want to find out if the Lapatinib results are drug specific, thus we will compare predicted drug sensitivity between HER2+ and HER2- for all drugs.
#' Note that we will parallelize this operation using the "parallel" library.
library("parallel")
#' Predict sensitivity to one drug for every column of the breast cancer expression matrix.
#'
#' Reads the globals `tpmDatMat_bc_tpm_logged` and `possibleDrugs`, and prints
#' the drug index once the prediction has finished (as a progress indicator).
#'
#' @param drug Index into `possibleDrugs` of the drug to predict.
#' @return The value returned by `pRRopheticPredict()` for that drug.
doPredict <- function(drug)
{
  drugName <- possibleDrugs[drug]
  thePred <- pRRopheticPredict(
    tpmDatMat_bc_tpm_logged,
    drugName,
    tissueType="allSolidTumors",
    batchCorrect="standardize",
    selection=1,
    removeLowVaryingGenes=0.2,
    removeLowVaringGenesFrom="rawData"
  )
  print(drug)
  thePred
}
# Names of the drugs to predict. Each one is passed as the drug argument of
# pRRopheticPredict(), and the vector is reused to label the per-drug results.
possibleDrugs <- c(
  "A.443654", "A.770041", "ABT.263", "ABT.888", "AG.014699", "AICAR",
  "AKT.inhibitor.VIII", "AMG.706", "AP.24534", "AS601245", "ATRA", "AUY922",
  "Axitinib", "AZ628", "AZD.0530", "AZD.2281", "AZD6244", "AZD6482",
  "AZD7762", "AZD8055", "BAY.61.3606", "Bexarotene", "BI.2536", "BIBW2992",
  "Bicalutamide", "BI.D1870", "BIRB.0796", "Bleomycin", "BMS.509744",
  "BMS.536924", "BMS.708163", "BMS.754807", "Bortezomib", "Bosutinib",
  "Bryostatin.1", "BX.795", "Camptothecin", "CCT007093", "CCT018159",
  "CEP.701", "CGP.082996", "CGP.60474", "CHIR.99021", "CI.1040", "Cisplatin",
  "CMK", "Cyclopamine", "Cytarabine", "Dasatinib", "DMOG", "Docetaxel",
  "Doxorubicin", "EHT.1864", "Elesclomol", "Embelin", "Epothilone.B",
  "Erlotinib", "Etoposide", "FH535", "FTI.277", "GDC.0449", "GDC0941",
  "Gefitinib", "Gemcitabine", "GNF.2", "GSK269962A", "GSK.650394",
  "GW.441756", "GW843682X", "Imatinib", "IPA.3", "JNJ.26854165", "JNK.9L",
  "JNK.Inhibitor.VIII", "JW.7.52.1", "KIN001.135", "KU.55933", "Lapatinib",
  "Lenalidomide", "LFM.A13", "Metformin", "Methotrexate", "MG.132",
  "Midostaurin", "Mitomycin.C", "MK.2206", "MS.275", "Nilotinib", "NSC.87877",
  "NU.7441", "Nutlin.3a", "NVP.BEZ235", "NVP.TAE684", "Obatoclax.Mesylate",
  "OSI.906", "PAC.1", "Paclitaxel", "Parthenolide", "Pazopanib", "PD.0325901",
  "PD.0332991", "PD.173074", "PF.02341066", "PF.4708671", "PF.562271",
  "PHA.665752", "PLX4720", "Pyrimethamine", "QS11", "Rapamycin", "RDEA119",
  "RO.3306", "Roscovitine", "Salubrinal", "SB.216763", "SB590885", "Shikonin",
  "SL.0101.1", "Sorafenib", "S.Trityl.L.cysteine", "Sunitinib", "Temsirolimus",
  "Thapsigargin", "Tipifarnib", "TW.37", "Vinblastine", "Vinorelbine",
  "Vorinostat", "VX.680", "VX.702", "WH.4.023", "WO2009093972", "WZ.1.84",
  "X17.AAG", "X681640", "XMD8.85", "Z.LLNle.CHO", "ZM.447439"
)
# Run the prediction for every drug in possibleDrugs, in parallel across nCores
# workers. The indices are derived from the vector itself so that the run always
# covers exactly the listed drugs, whatever the length of the list.
allSizesOut_raw <- mclapply(seq_along(possibleDrugs), doPredict, mc.cores=nCores)
allSizesOut <- allSizesOut_raw
save(allSizesOut, file=paste0(theRootDir, "dataOut/brcaDrugPredsAll.RData")) # this will be used in subsequent analysis....

# Remove predictions for non-primary-tumor samples (e.g. matched normal) from
# every drug's prediction vector.
# lapply() is used rather than assigning back with `[[<-` inside a loop: mclapply()
# can return NULL for a job whose worker died, and `x[[i]] <- NULL` deletes list
# element i, which would shift every later drug out of line with possibleDrugs.
allSizesOut <- lapply(allSizesOut, function(drugPreds) drugPreds[theTumorSamples])

names(her2status) <- clinDataBrca[, "bcr_patient_barcode"] # set the names on this back to how they were originally, to match below.

# One slot per drug, pre-filled with NA. Drugs whose prediction did not yield a
# numeric vector are skipped by the loop and stay NA, so every result vector keeps
# the same length as possibleDrugs and can always be labelled with it afterwards.
tOut <- rep(NA_real_, length(possibleDrugs))
meanDiff <- rep(NA_real_, length(possibleDrugs))
wilcoxOut <- rep(NA_real_, length(possibleDrugs))
medianDiff <- rep(NA_real_, length(possibleDrugs))

# Compare predicted sensitivity between the HER2- and HER2+ (IHC) groups for each drug.
for(i in which(vapply(allSizesOut, is.numeric, logical(1))))
{
  # Re-key this drug's predictions by patient barcode (first 12 characters, "." replaced by "-").
  newNames <- gsub(".", "-", substring(names(allSizesOut[[i]]), 1, 12), fixed=TRUE)
  names(allSizesOut[[i]]) <- newNames

  sampsInBothDatasets <- clinDataBrca[, "bcr_patient_barcode"][clinDataBrca[, "bcr_patient_barcode"] %in% newNames]

  her2Neg <- which(her2status[sampsInBothDatasets] == "Negative")
  her2Pos <- which(her2status[sampsInBothDatasets] == "Positive")
  her2Equiv <- which(her2status[sampsInBothDatasets] == "Equivocal")

  predsNeg <- allSizesOut[[i]][sampsInBothDatasets][her2Neg]
  predsPos <- allSizesOut[[i]][sampsInBothDatasets][her2Pos]

  tOut[i] <- t.test(predsNeg, predsPos)$p.value
  meanDiff[i] <- mean(predsNeg) - mean(predsPos)

  wilcoxOut[i] <- wilcox.test(predsNeg, predsPos)$p.value
  medianDiff[i] <- median(predsNeg) - median(predsPos)
}
names(allSizesOut) <- possibleDrugs
names(tOut) <- possibleDrugs
names(meanDiff) <- possibleDrugs
names(wilcoxOut) <- possibleDrugs
names(medianDiff) <- possibleDrugs

# Write every drug with its t-test p-value and mean difference (HER2- minus HER2+),
# ordered from the smallest p-value to the largest.
byTTestPValue <- order(tOut)
write.csv(
  cbind(possibleDrugs[byTTestPValue], tOut[byTTestPValue], meanDiff[byTTestPValue]),
  file=paste0(theRootDir, "dataOut/brcaHer2VsHer2Neg.csv")
)

# Show the ten smallest p-values from each test.
print(sort(tOut)[1:10])
print(sort(wilcoxOut)[1:10])

#' Create a histogram of all p-values for each test, highlighting Lapatinib on the tail of the distribution.
plotHer2PValueHistogram <- function(pValues, svgFile)
{
  # Draw -log10 p-values for every drug into an SVG file and mark Lapatinib's
  # value with a red vertical line. pValues must be named by drug.
  svg(svgFile, width=3, height=4)
  hist(-log10(pValues), main="", xlab="-log10 P-value", las=1, col="#8dd3c7")
  abline(v=-log10(pValues["Lapatinib"]), col="red")
  dev.off()
}

plotHer2PValueHistogram(tOut, paste0(theRootDir, "figures/pValsHer2_tTests.svg"))

plotHer2PValueHistogram(wilcoxOut, paste0(theRootDir, "figures/pValsHer2_wilcoxTests.svg"))


#' # Re-do the above analysis with predictions that were generated by applying the models across the entire TCGA dataset (i.e. the data created in "getPredsOnAllTCGA_batchCorrData.R"). Are the results consistent with those that we achieve when only applying to breast cancer samples?
#' The purpose of this analysis is to check whether applying these models across all of the TCGA has any noticeable effect on the results in any particular cancer type. These results suggest that, at least in this instance, this does not substantially affect the results.

#' Load the predictions created across all of TCGA, created in "getPredsOnAllTCGA_batchCorrData.R"
load(file=paste0(theRootDir, "dataOut/allDrugPredictions_mat.RData")) # allDrugPredictions_mat, cancerTypesVec, 

# Lapatinib predictions for the columns labelled as BRCA.
brcaPreds_allData <- allDrugPredictions_mat["Lapatinib", cancerTypesVec == "BRCA"]

#' Calculate the difference in predicted lapatinib sensitivity between the HER2+ and HER2- groups.
newNames <- gsub(".", "-", substring(names(brcaPreds_allData), 1, 12), fixed=TRUE)
names(brcaPreds_allData) <- newNames

clinicalBarcodes <- clinDataBrca[, "bcr_patient_barcode"]
sampsInBothDatasets <- clinicalBarcodes[clinicalBarcodes %in% newNames]

# IHC calls for the patients that also have a prediction.
ihcCallsInBoth <- her2status[sampsInBothDatasets]
her2Neg <- which(ihcCallsInBoth == "Negative")
her2Pos <- which(ihcCallsInBoth == "Positive")
her2Equiv <- which(ihcCallsInBoth == "Equivocal")

print(wilcox.test(
  brcaPreds_allData[sampsInBothDatasets][her2Neg],
  brcaPreds_allData[sampsInBothDatasets][her2Pos]
))
print(t.test(
  brcaPreds_allData[sampsInBothDatasets][her2Neg],
  brcaPreds_allData[sampsInBothDatasets][her2Pos]
))

#' Are these results also drug specific? Yes.
# Turn the prediction matrix (one row per drug, one column per sample) into a list
# with one prediction vector per drug, labelled with the matrix's own row names.
x <- t(allDrugPredictions_mat)
allSizesOut <- split(x, rep(seq_len(ncol(x)), each = nrow(x)))
names(allSizesOut) <- rownames(allDrugPredictions_mat)

# Every drug shares the same samples, so the patient-barcode names and the HER2
# groups are the same for all drugs and are worked out once, outside the loop.
newNames <- gsub(".", "-", substring(colnames(allDrugPredictions_mat), 1, 12), fixed=TRUE)
allSizesOut <- lapply(allSizesOut, function(drugPreds) {
  names(drugPreds) <- newNames
  drugPreds
})

sampsInBothDatasets <- clinDataBrca[, "bcr_patient_barcode"][clinDataBrca[, "bcr_patient_barcode"] %in% newNames]

her2Neg <- which(her2status[sampsInBothDatasets] == "Negative")
her2Pos <- which(her2status[sampsInBothDatasets] == "Positive")
her2Equiv <- which(her2status[sampsInBothDatasets] == "Equivocal")

# Results are labelled with the drug names taken from the matrix the predictions
# came from, not with possibleDrugs: relabelling with a separately maintained
# vector would silently mislabel the results if its order or length differed.
# Slots are pre-filled with NA so that a skipped drug stays NA.
drugNames <- names(allSizesOut)
tOut <- setNames(rep(NA_real_, length(allSizesOut)), drugNames)
meanDiff <- setNames(rep(NA_real_, length(allSizesOut)), drugNames)
wilcoxOut <- setNames(rep(NA_real_, length(allSizesOut)), drugNames)
medianDiff <- setNames(rep(NA_real_, length(allSizesOut)), drugNames)

for(i in which(vapply(allSizesOut, is.numeric, logical(1))))
{
  predsNeg <- allSizesOut[[i]][sampsInBothDatasets][her2Neg]
  predsPos <- allSizesOut[[i]][sampsInBothDatasets][her2Pos]

  tOut[i] <- t.test(predsNeg, predsPos)$p.value
  meanDiff[i] <- mean(predsNeg) - mean(predsPos)

  wilcoxOut[i] <- wilcox.test(predsNeg, predsPos)$p.value
  medianDiff[i] <- median(predsNeg) - median(predsPos)
}

print(sort(tOut)[1:10])
