This script performs the IDWAS analysis on the TCGA breast cancer data, i.e. it demonstrates that predictive models can recapitulate what would be expected in terms of drug response. This script creates much of Figure 2 and the results and tables that are related to Figure 2. Set the root directory to the location of the data. This must be modified for your own use (based on the location of the files).
theRootDir <- "/mnt/data_scratch/finalData/"
NB We are using 10 cores for this by default, you may want to adjust this based on the number of cores available on the machine on which you are running this code.
nCores <- 10
Load the tpm data for breast cancer RNA-seq data. Log transform.
# Path (relative to theRootDir) of the BRCA RSEM gene-level expression file.
brcaDataLoc <- paste0(theRootDir, "dataIn/rnaSeq/gdac.broadinstitute.org_BRCA.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.Level_3.2015082100.0.0/BRCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt")
tpmDatMat_bc <- read.delim(brcaDataLoc, as.is=TRUE)

# The first data row labels what each column measures. Keep only the columns
# labelled "scaled_estimate", drop that label row, and convert to a numeric
# matrix in a single pass (this is the expensive step, so it is done only once).
# drop=FALSE keeps a data.frame even if only one column matches.
scaledEstimateCols <- which(tpmDatMat_bc[1,] == "scaled_estimate")
tpmDatMat_bc_tpm <- apply(tpmDatMat_bc[-1, scaledEstimateCols, drop=FALSE], 2, as.numeric)

# Gene identifiers: take the text before the first "|" of each "Hybridization.REF"
# entry, then drop the entry belonging to the label row.
geneNames <- do.call(cbind, strsplit(tpmDatMat_bc[, "Hybridization.REF"], "|", fixed=TRUE))[1,][-1]
rownames(tpmDatMat_bc_tpm) <- geneNames

# Truncate the column names to their first 28 characters.
colnames(tpmDatMat_bc_tpm) <- substr(colnames(tpmDatMat_bc_tpm), 1, 28)

# Scale by 1e6 and log-transform with a +1 offset so that zeros stay finite.
tpmDatMat_bc_tpm_logged <- log((tpmDatMat_bc_tpm*1000000)+1)
Load the pRRophetic library.
library(pRRophetic)
## Warning: replacing previous import by 'genefilter::Anova' when loading
## 'pRRophetic'
Predict Lapatinib sensitivity in all breast cancer samples.
# Run the pRRophetic Lapatinib model on the logged expression matrix.
bcaPreds <- pRRopheticPredict(
  tpmDatMat_bc_tpm_logged,
  "Lapatinib",
  selection=1,
  batchCorrect="standardize",
  removeLowVaryingGenes=0.2,
  removeLowVaringGenesFrom="rawData",
  tissueType="allSolidTumors"
)
##
## 11503 gene identifiers overlap between the supplied expression matrices...
##
##
## 3601 low variabilty genes filtered.
## Fitting Ridge Regression model... Done
##
## Calculating predicted phenotype...Done
Load the matched clinical data for BRCA. This file contains the HER2 status, as measured by immunohistochemistry.
# Read the BRCA clinical patient table, then build a vector of the
# "her2_status_by_ihc" column named by "bcr_patient_barcode".
clinicalDataLocation <- paste0(theRootDir, "dataIn/clinical/nationwidechildrens.org_clinical_patient_brca.txt")
clinDataBrca <- read.delim(clinicalDataLocation, as.is=TRUE)
her2status <- setNames(clinDataBrca[, "her2_status_by_ihc"], clinDataBrca[, "bcr_patient_barcode"])
Identify the HER2+, HER2- and HER2 Equivocal samples. Note: an earlier version of this analysis mistakenly included the normal samples; removing the normal samples should improve the results.
sampleNames <- colnames(tpmDatMat_bc_tpm_logged)

# Characters 14-16 of each sample name hold the sample-type code. TCGA annotates
# tumor samples as "01" and normal samples as "10"; only names whose code is
# exactly "01A" are kept here.
theTumorSamples <- which(substring(sampleNames, 14, 16) == "01A")

# Name the predictions by the first 12 characters of the sample name, with "."
# replaced by "-" so they match the "bcr_patient_barcode" values.
newNames <- gsub(".", "-", substring(sampleNames, 1, 12), fixed=TRUE)
names(bcaPreds) <- newNames
bcaPreds <- bcaPreds[theTumorSamples] # Only include the tumor samples in this analysis. Results on normal samples are meaningless.

# Match clinical records against the barcodes that survived the tumor filter
# (names(bcaPreds)), not against all sample names. Otherwise a patient whose
# only samples were filtered out would still be placed in a HER2 group, with
# an NA prediction, and would inflate the group sizes.
clinicalBarcodes <- clinDataBrca[, "bcr_patient_barcode"]
sampsInBothDatasets <- clinicalBarcodes[clinicalBarcodes %in% names(bcaPreds)]

# Positions (within sampsInBothDatasets) of each IHC HER2 group.
ihcStatusMatched <- her2status[sampsInBothDatasets]
her2Neg <- which(ihcStatusMatched == "Negative")
her2Pos <- which(ihcStatusMatched == "Positive")
her2Equiv <- which(ihcStatusMatched == "Equivocal")
Calculate the difference in predicted lapatinib sensitivity between the HER2+ and HER2- groups.
print(wilcox.test(bcaPreds[sampsInBothDatasets][her2Neg], bcaPreds[sampsInBothDatasets][her2Pos]))
##
## Wilcoxon rank sum test with continuity correction
##
## data: bcaPreds[sampsInBothDatasets][her2Neg] and bcaPreds[sampsInBothDatasets][her2Pos]
## W = 60329, p-value = 2.617e-11
## alternative hypothesis: true location shift is not equal to 0
print(t.test(bcaPreds[sampsInBothDatasets][her2Neg], bcaPreds[sampsInBothDatasets][her2Pos]))
##
## Welch Two Sample t-test
##
## data: bcaPreds[sampsInBothDatasets][her2Neg] and bcaPreds[sampsInBothDatasets][her2Pos]
## t = 7.5233, df = 289.58, p-value = 6.768e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1748531 0.2987541
## sample estimates:
## mean of x mean of y
## 4.136513 3.899710
Plot the difference in predicted lapatinib sensitivity between the HER2 +/-/Equivocal groups.
# Write a boxplot of predicted Lapatinib sensitivity, one box per IHC HER2
# group (Negative / Equivocal / Positive), to an SVG file.
svg(paste0(theRootDir, "figures/BRCA_boxplot_clinical.svg"), width=3, height=4)
predsMatched <- bcaPreds[sampsInBothDatasets]
her2Groups <- list(
  Negative=predsMatched[her2Neg],
  Equivocal=predsMatched[her2Equiv],
  Positive=predsMatched[her2Pos]
)
boxplot(
  her2Groups,
  las=1,
  col=c("#66c2a5", "#fc8d62", "#8da0cb"),
  pch=20,
  width=c(.75, .75, .75),
  ylab="Predicted Lapatinib Sensitivity",
  cex.axis=.75,
  outcol="#00000033"
)
dev.off()
## png
## 2
Print the numbers of samples in each of the 3 groups.
length(her2Neg)
## [1] 560
length(her2Equiv)
## [1] 179
length(her2Pos)
## [1] 164
Also test the results for FISH… her2_fish_status column in the clinical file
# FISH-based HER2 calls, named by patient barcode, and the positions (within
# sampsInBothDatasets) of the FISH-negative and FISH-positive patients.
her2status_fish <- setNames(clinDataBrca[, "her2_fish_status"], clinDataBrca[, "bcr_patient_barcode"])
fishStatusMatched <- her2status_fish[sampsInBothDatasets]
her2Neg_fish <- which(fishStatusMatched == "Negative")
her2Pos_fish <- which(fishStatusMatched == "Positive")
length(her2Neg_fish)
## [1] 333
length(her2Pos_fish)
## [1] 78
print(wilcox.test(bcaPreds[sampsInBothDatasets][her2Neg_fish], bcaPreds[sampsInBothDatasets][her2Pos_fish]))
##
## Wilcoxon rank sum test with continuity correction
##
## data: bcaPreds[sampsInBothDatasets][her2Neg_fish] and bcaPreds[sampsInBothDatasets][her2Pos_fish]
## W = 16120, p-value = 0.000104
## alternative hypothesis: true location shift is not equal to 0
print(t.test(bcaPreds[sampsInBothDatasets][her2Neg_fish], bcaPreds[sampsInBothDatasets][her2Pos_fish]))
##
## Welch Two Sample t-test
##
## data: bcaPreds[sampsInBothDatasets][her2Neg_fish] and bcaPreds[sampsInBothDatasets][her2Pos_fish]
## t = 4.5957, df = 137.62, p-value = 9.655e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1060258 0.2661637
## sample estimates:
## mean of x mean of y
## 4.107303 3.921209
I also want to write out a table of the patients who received lapatinib or trastuzumab (also recorded as herceptin).
# Read the merged clinical file. na.strings is set to a placeholder so that
# only that exact text is treated as missing while reading.
mergedClinicalPath <- paste0(theRootDir, "dataIn/clinical/gdac.broadinstitute.org_BRCA.Merge_Clinical.Level_1.2016012800.0.0/BRCA.clin.merged.txt")
brcaClinData <- read.delim(mergedClinicalPath, as.is=TRUE, na.strings="somethingthatsnotNA")

# The first column holds the field names: use it as row names, then drop it so
# that every remaining column is one patient.
rownames(brcaClinData) <- brcaClinData[,1]
brcaClinData_filt <- brcaClinData[, -1]
There are 23 rows containing info on drugs taken; they are “patient.drugs.drug.drug_name” and “patient.drugs.drug-2.drug_name”…“patient.drugs.drug-23.drug_name”. TCGA IDs are in the row patient.bcr_patient_barcode. I want to go through these 23 rows and identify patients who received at least one HER2 targeted therapy (“trastuzumab”, “lapatinib” or “herceptin”). Note that trastuzumab and herceptin are the same drug, but both of these terms have been used in this data.
theHer2Drugs <- c("trastuzumab", "lapatinib", "herceptin")

# Row names of the 23 drug-name fields: the first carries no index, the
# remaining ones are numbered "drug-2" ... "drug-23".
drugMatrixRowNames <- c("patient.drugs.drug.drug_name", paste0("patient.drugs.drug-", 2:23, ".drug_name"))

# Column positions of patients with a HER2 drug in any of those fields, in
# field order. A patient can appear more than once; duplicates are removed below.
patientsOnHerTherapy <- unlist(lapply(drugMatrixRowNames, function(drugRow) {
  which(brcaClinData_filt[drugRow, ] %in% theHer2Drugs)
}))

# Short patient IDs: characters 9-12 of the barcode, upper-cased.
allShortIds <- toupper(substring(brcaClinData_filt["patient.bcr_patient_barcode", ], 9, 12))
tcgaIdsOnHer2Therapy <- allShortIds[unique(patientsOnHerTherapy)]
length(tcgaIdsOnHer2Therapy)
## [1] 73

# Copy of the predictions keyed by the same short IDs.
bcaPreds_newNames <- setNames(bcaPreds, substring(names(bcaPreds), 9, 12))
Test the difference in predicted lapatinib response between patients who received a HER2 targeted therapy and those who did not.
t.test(bcaPreds_newNames[tcgaIdsOnHer2Therapy], bcaPreds_newNames[!(names(bcaPreds_newNames) %in% tcgaIdsOnHer2Therapy)])
##
## Welch Two Sample t-test
##
## data: bcaPreds_newNames[tcgaIdsOnHer2Therapy] and bcaPreds_newNames[!(names(bcaPreds_newNames) %in% tcgaIdsOnHer2Therapy)]
## t = -6.083, df = 83.583, p-value = 3.412e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3473851 -0.1762048
## sample estimates:
## mean of x mean of y
## 3.842224 4.104019
Create a table with the drug info, include a row indicating whether the patient was annotated as having received a HER2 therapy.
# One row per drug-name field, one column per patient (labelled by short ID).
drugMatrix <- brcaClinData_filt[drugMatrixRowNames, ]
colnames(drugMatrix) <- toupper(substring(brcaClinData_filt["patient.bcr_patient_barcode", ], 9, 12))

# "YES" for patients found among tcgaIdsOnHer2Therapy, "NO" otherwise
# (FALSE + 1 picks "NO", TRUE + 1 picks "YES").
receivedHer2Drug <- colnames(drugMatrix) %in% tcgaIdsOnHer2Therapy
patientAnnotatedWithHer2Drug <- c("NO", "YES")[receivedHer2Drug + 1]
names(patientAnnotatedWithHer2Drug) <- colnames(drugMatrix)

# Put the annotation row on top of the drug rows, label the rows, and write the table out.
drugMatrixFin <- rbind(patientAnnotatedWithHer2Drug, drugMatrix)
rownames(drugMatrixFin) <- c("Pateint annotated with HER2 therapy", paste("Drug", seq_len(nrow(drugMatrix))))
write.csv(drugMatrixFin, file=paste0(theRootDir, "dataOut/brcaPatientDrugInfo.csv"))
I want a table with patient ID, predicted lapatinib response, HER2 status by IHC, HER2 status by FISH and a “Yes” if they received a HER2 targeted therapy.
# The four inputs are bcaPreds, her2status, her2status_fish and
# patientAnnotatedWithHer2Drug. Re-key the first three by characters 9-12 of
# their current names so that all four share the same short patient ID.
names(bcaPreds) <- substring(names(bcaPreds), 9, 12)
names(her2status) <- substring(names(her2status), 9, 12)
names(her2status_fish) <- substring(names(her2status_fish), 9, 12)

# Patients present in all four vectors.
samplesWithAll <- Reduce(intersect, list(
  names(bcaPreds),
  names(her2status),
  names(her2status_fish),
  names(patientAnnotatedWithHer2Drug)
))

# Assemble the table column by column and write it out.
outInfoSuppTabOne <- cbind(
  TCGA_Sample_ID=samplesWithAll,
  Imputed_Lapatinib_Response=bcaPreds[samplesWithAll],
  HER_Status_by_IHC=her2status[samplesWithAll],
  Her2_Status_by_FISH=her2status_fish[samplesWithAll],
  Patient_Rx_Her2_Therapy=patientAnnotatedWithHer2Drug[samplesWithAll]
)
write.csv(outInfoSuppTabOne, file=paste0(theRootDir, "dataOut/brcaPatientInfo_suppTab1.csv"))
We want to find out if the Lapatinib results are drug specific, thus we will compare predicted drug sensitivity between HER2+ and HER2- for all drugs. Note that we will parallelize this operation using the “parallel” library.
library("parallel")
# Predict response to one drug on the logged BRCA expression matrix.
#   drug: integer index into the global vector possibleDrugs.
# Prints the index once the prediction has finished (a simple progress
# indicator) and returns whatever pRRopheticPredict returned.
doPredict <- function(drug)
{
  drugName <- possibleDrugs[drug]
  thePred <- pRRopheticPredict(
    tpmDatMat_bc_tpm_logged,
    drugName,
    selection=1,
    batchCorrect="standardize",
    removeLowVaryingGenes=0.2,
    removeLowVaringGenesFrom="rawData",
    tissueType="allSolidTumors"
  )
  print(drug)
  thePred
}
possibleDrugs <- c("A.443654", "A.770041", "ABT.263", "ABT.888", "AG.014699", "AICAR", "AKT.inhibitor.VIII", "AMG.706", "AP.24534", "AS601245", "ATRA", "AUY922", "Axitinib", "AZ628", "AZD.0530", "AZD.2281", "AZD6244", "AZD6482", "AZD7762", "AZD8055", "BAY.61.3606", "Bexarotene", "BI.2536", "BIBW2992", "Bicalutamide", "BI.D1870", "BIRB.0796", "Bleomycin", "BMS.509744", "BMS.536924", "BMS.708163", "BMS.754807", "Bortezomib", "Bosutinib", "Bryostatin.1", "BX.795", "Camptothecin", "CCT007093", "CCT018159", "CEP.701", "CGP.082996", "CGP.60474", "CHIR.99021", "CI.1040", "Cisplatin", "CMK", "Cyclopamine", "Cytarabine", "Dasatinib", "DMOG", "Docetaxel", "Doxorubicin", "EHT.1864", "Elesclomol", "Embelin", "Epothilone.B", "Erlotinib", "Etoposide", "FH535", "FTI.277", "GDC.0449", "GDC0941", "Gefitinib", "Gemcitabine", "GNF.2", "GSK269962A", "GSK.650394", "GW.441756", "GW843682X", "Imatinib", "IPA.3", "JNJ.26854165", "JNK.9L", "JNK.Inhibitor.VIII", "JW.7.52.1", "KIN001.135", "KU.55933", "Lapatinib", "Lenalidomide", "LFM.A13", "Metformin", "Methotrexate", "MG.132", "Midostaurin", "Mitomycin.C", "MK.2206", "MS.275", "Nilotinib", "NSC.87877", "NU.7441", "Nutlin.3a", "NVP.BEZ235", "NVP.TAE684", "Obatoclax.Mesylate", "OSI.906", "PAC.1", "Paclitaxel", "Parthenolide", "Pazopanib", "PD.0325901", "PD.0332991", "PD.173074", "PF.02341066", "PF.4708671", "PF.562271", "PHA.665752", "PLX4720", "Pyrimethamine", "QS11", "Rapamycin", "RDEA119", "RO.3306", "Roscovitine", "Salubrinal", "SB.216763", "SB590885", "Shikonin", "SL.0101.1", "Sorafenib", "S.Trityl.L.cysteine", "Sunitinib", "Temsirolimus", "Thapsigargin", "Tipifarnib", "TW.37", "Vinblastine", "Vinorelbine", "Vorinostat", "VX.680", "VX.702", "WH.4.023", "WO2009093972", "WZ.1.84", "X17.AAG", "X681640", "XMD8.85", "Z.LLNle.CHO", "ZM.447439")
# Run doPredict for every entry of possibleDrugs, spread over nCores processes.
# The index range is derived from possibleDrugs so it cannot drift out of sync
# with the drug list.
allSizesOut_raw <- mclapply(seq_along(possibleDrugs), doPredict, mc.cores=nCores)
allSizesOut <- allSizesOut_raw
save(allSizesOut, file=paste0(theRootDir, "dataOut/brcaDrugPredsAll.RData")) # this will be used in subsequent analysis....

# Remove predictions for non-primary-tumor samples (e.g. matched normal).
# lapply keeps one list slot per drug even when a slot is NULL, so positions
# stay aligned with possibleDrugs (assigning NULL with [[<- would delete the slot).
allSizesOut <- lapply(allSizesOut, function(drugPreds) drugPreds[theTumorSamples])
names(her2status) <- clinDataBrca[, "bcr_patient_barcode"] # set the names back to the full patient barcodes, to match below.

# The result vectors are labelled with possibleDrugs by position, so the list
# of predictions must have exactly one slot per drug.
stopifnot(length(allSizesOut) == length(possibleDrugs))

# One slot per drug, pre-filled with NA: drugs whose prediction is not a
# numeric vector (e.g. a failed worker) simply stay NA instead of shortening
# the vectors and breaking the naming step below.
nDrugs <- length(allSizesOut)
tOut <- rep(NA_real_, nDrugs)
meanDiff <- rep(NA_real_, nDrugs)
wilcoxOut <- rep(NA_real_, nDrugs)
medianDiff <- rep(NA_real_, nDrugs)

for(i in which(vapply(allSizesOut, is.numeric, logical(1))))
{
  # Re-key this drug's predictions by 12-character patient barcode.
  newNames <- gsub(".", "-", substring(names(allSizesOut[[i]]), 1, 12), fixed=TRUE)
  names(allSizesOut[[i]]) <- newNames

  # Patients with both a clinical record and a prediction, split by IHC HER2 status.
  sampsInBothDatasets <- clinDataBrca[, "bcr_patient_barcode"][clinDataBrca[, "bcr_patient_barcode"] %in% newNames]
  her2Neg <- which(her2status[sampsInBothDatasets] == "Negative")
  her2Pos <- which(her2status[sampsInBothDatasets] == "Positive")
  her2Equiv <- which(her2status[sampsInBothDatasets] == "Equivocal")

  predsNeg <- allSizesOut[[i]][sampsInBothDatasets][her2Neg]
  predsPos <- allSizesOut[[i]][sampsInBothDatasets][her2Pos]

  # HER2-negative versus HER2-positive: p-values and effect sizes.
  tOut[i] <- t.test(predsNeg, predsPos)$p.value
  meanDiff[i] <- mean(predsNeg) - mean(predsPos)
  wilcoxOut[i] <- wilcox.test(predsNeg, predsPos)$p.value
  medianDiff[i] <- median(predsNeg) - median(predsPos)
}

names(allSizesOut) <- possibleDrugs
names(tOut) <- possibleDrugs
names(meanDiff) <- possibleDrugs
names(wilcoxOut) <- possibleDrugs
names(medianDiff) <- possibleDrugs

# Write drug, t-test p-value and mean difference, ordered by p-value.
pOrder <- order(tOut)
write.csv(cbind(possibleDrugs[pOrder], tOut[pOrder], meanDiff[pOrder]), file=paste0(theRootDir, "dataOut/brcaHer2VsHer2Neg.csv"))
print(sort(tOut)[1:10])
## Lapatinib A.443654 Bosutinib X681640 Methotrexate
## 6.767662e-13 1.070916e-11 1.086888e-11 1.686819e-11 2.712440e-11
## PF.4708671 AP.24534 AZD.2281 Tipifarnib Roscovitine
## 7.856292e-10 1.433591e-09 1.829792e-09 1.480258e-08 1.700331e-08
print(sort(wilcoxOut)[1:10])
## Lapatinib A.443654 X681640 Bosutinib Roscovitine
## 2.616796e-11 6.905257e-10 8.541356e-10 9.067471e-10 1.298988e-09
## AP.24534 Tipifarnib Methotrexate PF.4708671 AZD.2281
## 1.845042e-09 2.375066e-09 4.158808e-09 5.095230e-09 5.436627e-09
Create a histogram of all p-values, highlighting Lapatinib on the tail of the distribution.
# Histogram of -log10 t-test p-values over all drugs; the red vertical line
# marks the value for Lapatinib.
negLog10T <- -log10(tOut)
svg(paste0(theRootDir, "figures/pValsHer2_tTests.svg"), width=3, height=4)
hist(negLog10T, main="", xlab="-log10 P-value", las=1, col="#8dd3c7")
abline(v=negLog10T["Lapatinib"], col="red")
dev.off()
## png
## 2
# Histogram of -log10 Wilcoxon p-values over all drugs; the red vertical line
# marks the value for Lapatinib.
negLog10Wilcox <- -log10(wilcoxOut)
svg(paste0(theRootDir, "figures/pValsHer2_wilcoxTests.svg"), width=3, height=4)
hist(negLog10Wilcox, main="", xlab="-log10 P-value", las=1, col="#8dd3c7")
abline(v=negLog10Wilcox["Lapatinib"], col="red")
dev.off()
## png
## 2
The purpose of this analysis is to check whether applying these models across all of the TCGA has any noticeable effect on the results in any particular cancer type. These results suggest that, at least in this instance, this does not substantially affect the results. Load the predictions created across all of TCGA, created in “getPredsOnAllTCGA_batchCorrData.R”
# Loading this file is expected to provide allDrugPredictions_mat and cancerTypesVec.
load(file=paste0(theRootDir, "dataOut/allDrugPredictions_mat.RData"))

# Lapatinib row, restricted to the columns flagged as BRCA.
isBrcaSample <- cancerTypesVec == "BRCA"
brcaPreds_allData <- allDrugPredictions_mat["Lapatinib", isBrcaSample]
Calculate the difference in predicted lapatinib sensitivity between the HER2+ and HER2- groups.
# Re-key the pan-TCGA BRCA predictions by 12-character patient barcode.
newNames <- gsub(".", "-", substring(names(brcaPreds_allData), 1, 12), fixed=TRUE)
names(brcaPreds_allData) <- newNames

# Patients with both a clinical record and a prediction, split by IHC HER2 status.
clinicalBarcodes <- clinDataBrca[, "bcr_patient_barcode"]
sampsInBothDatasets <- clinicalBarcodes[clinicalBarcodes %in% newNames]
ihcStatusMatched <- her2status[sampsInBothDatasets]
her2Neg <- which(ihcStatusMatched == "Negative")
her2Pos <- which(ihcStatusMatched == "Positive")
her2Equiv <- which(ihcStatusMatched == "Equivocal")
print(wilcox.test(brcaPreds_allData[sampsInBothDatasets][her2Neg], brcaPreds_allData[sampsInBothDatasets][her2Pos]))
##
## Wilcoxon rank sum test with continuity correction
##
## data: brcaPreds_allData[sampsInBothDatasets][her2Neg] and brcaPreds_allData[sampsInBothDatasets][her2Pos]
## W = 60958, p-value = 1.727e-10
## alternative hypothesis: true location shift is not equal to 0
print(t.test(brcaPreds_allData[sampsInBothDatasets][her2Neg], brcaPreds_allData[sampsInBothDatasets][her2Pos]))
##
## Welch Two Sample t-test
##
## data: brcaPreds_allData[sampsInBothDatasets][her2Neg] and brcaPreds_allData[sampsInBothDatasets][her2Pos]
## t = 7.5501, df = 301.99, p-value = 5.188e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1116788 0.1904168
## sample estimates:
## mean of x mean of y
## 4.453504 4.302457
Are these results also drug specific? Yes.
# Turn the drug-by-sample prediction matrix into a list with one numeric
# vector per drug, each named by the matrix's column names.
x <- t(allDrugPredictions_mat)
allSizesOut <- split(x, rep(seq_len(ncol(x)), each = nrow(x)))
names(allSizesOut) <- rownames(allDrugPredictions_mat)
for(i in seq_along(allSizesOut)){names(allSizesOut[[i]]) <- colnames(allDrugPredictions_mat)}

# One result slot per drug, pre-filled with NA so that every vector always has
# the same length as the list of drugs.
nDrugs <- length(allSizesOut)
tOut <- rep(NA_real_, nDrugs)
meanDiff <- rep(NA_real_, nDrugs)
wilcoxOut <- rep(NA_real_, nDrugs)
medianDiff <- rep(NA_real_, nDrugs)

for(i in which(vapply(allSizesOut, is.numeric, logical(1))))
{
  # Re-key this drug's predictions by 12-character patient barcode.
  newNames <- gsub(".", "-", substring(names(allSizesOut[[i]]), 1, 12), fixed=TRUE)
  names(allSizesOut[[i]]) <- newNames

  # Patients with both a clinical record and a prediction, split by IHC HER2 status.
  sampsInBothDatasets <- clinDataBrca[, "bcr_patient_barcode"][clinDataBrca[, "bcr_patient_barcode"] %in% newNames]
  her2Neg <- which(her2status[sampsInBothDatasets] == "Negative")
  her2Pos <- which(her2status[sampsInBothDatasets] == "Positive")
  her2Equiv <- which(her2status[sampsInBothDatasets] == "Equivocal")

  predsNeg <- allSizesOut[[i]][sampsInBothDatasets][her2Neg]
  predsPos <- allSizesOut[[i]][sampsInBothDatasets][her2Pos]

  # HER2-negative versus HER2-positive: p-values and effect sizes.
  tOut[i] <- t.test(predsNeg, predsPos)$p.value
  meanDiff[i] <- mean(predsNeg) - mean(predsPos)
  wilcoxOut[i] <- wilcox.test(predsNeg, predsPos)$p.value
  medianDiff[i] <- median(predsNeg) - median(predsPos)
}

# Label every result with the drug name taken from the prediction matrix's own
# row names (already stored as names(allSizesOut)), so the labels cannot get
# out of step with the rows they were computed from.
drugLabels <- names(allSizesOut)
names(tOut) <- drugLabels
names(meanDiff) <- drugLabels
names(wilcoxOut) <- drugLabels
names(medianDiff) <- drugLabels
print(sort(tOut)[1:10])
## Lapatinib X681640 WZ.1.84
## 5.188159e-13 3.376679e-11 1.736731e-09
## Obatoclax.Mesylate AZD6482 GDC0941
## 5.929269e-09 1.950136e-08 3.222777e-08
## MK.2206 PLX4720 AZD7762
## 3.547856e-08 5.031109e-08 7.223677e-08
## JW.7.52.1
## 1.392233e-06