In this script I want to apply the models across every TCGA sample. I will look at the association between CNVs and drug sensitivity. In particular, I am interested in whether the ERBB2 and lapatinib association is recovered here. Load the required libraries and set the root directory.
library("pRRophetic")
## Warning: replacing previous import by 'genefilter::Anova' when loading
## 'pRRophetic'
library("parallel")
library("gdata")
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
# Root directory that every input ("dataIn"), output ("dataOut") and figure ("figures") path below is built from.
# It ends in "/", so paths that also start with "/" contain a harmless double slash.
theRootDir <- "/mnt/data_scratch/finalData/"
Define a simple function to turn two-tailed p-values (e.g. those from a linear regression) into one-tailed p-values, using the sign of the corresponding effect estimate.
# Convert two-tailed p-values (e.g. from coef(summary(lm(...)))) into one-tailed
# p-values, using the sign of the matching effect estimate.
#
# Args:
#   pvals:       numeric vector of two-tailed p-values; its names are carried
#                over to the result.
#   betaVals:    numeric vector of effect estimates, same length and order as
#                pvals.
#   alternative: "greater" tests for a positive effect, "less" for a negative
#                effect. Any other value is an error.
#
# Returns:
#   Numeric vector, named like pvals: p / 2 where the estimate lies in the
#   direction of the alternative and 1 - p / 2 otherwise. Zero-length input
#   gives a zero-length result; an NA p-value or estimate gives NA.
makeOneTail <- function(pvals, betaVals, alternative="greater")
{
  # Fail loudly on an unsupported alternative instead of returning nothing.
  alternative <- match.arg(alternative, c("greater", "less"))
  stopifnot(length(pvals) == length(betaVals))

  # TRUE where the estimate points in the direction being tested.
  inDirection <- if(alternative == "greater") betaVals > 0 else betaVals < 0

  # as.numeric() strips the attributes ifelse() copies from its test vector and
  # keeps the zero-length case numeric.
  outVec <- as.numeric(ifelse(inDirection, pvals / 2, 1 - (pvals / 2)))
  names(outVec) <- names(pvals)
  return(outVec)
}
Load the expression data….
# Brings `cancerTypesVec` and `tenRuvNewStandardApproach` into the workspace.
load(file = paste0(theRootDir, "/dataIn/tenRuvNewStandardApproach.RData"))
# Replace every literal "." in the column names with "-"
# (presumably to restore the TCGA barcode format -- confirm).
colnames(tenRuvNewStandardApproach) <- gsub(
  pattern = ".",
  replacement = "-",
  x = colnames(tenRuvNewStandardApproach),
  fixed = TRUE
)
Load the cnv data for all of TCGA
# Cohort abbreviations; one gene-level CNV .RData file is read per entry.
diseaseAbbrvs <- c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "DLBC", "GBM", "HNSC", "KICH", "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")
theCnvQuantVecList_mat_list <- vector("list", length(diseaseAbbrvs))
tumorSamps_list <- vector("list", length(diseaseAbbrvs))
gnamesList <- vector("list", length(diseaseAbbrvs))
for(i in seq_along(diseaseAbbrvs))
{
  load(file=paste(theRootDir, "/dataIn/tcga_cnv_subtracted/cnvsMappedToGenes/", diseaseAbbrvs[i], ".RData", sep="")) # theCnvQuantVecList_mat, tumorSamps
  theCnvQuantVecList_mat_list[[i]] <- theCnvQuantVecList_mat
  tumorSamps_list[[i]] <- tumorSamps
  # Record this cohort's gene (row) names. The previous code assigned
  # gnamesList into itself, so no gene names were ever stored.
  # `[i] <- list(...)` keeps the slot even if the row names are NULL.
  gnamesList[i] <- list(rownames(theCnvQuantVecList_mat))
}
# cbind() joins by position and keeps only the first matrix's row names, so
# differing gene rows across cohorts would be silently misaligned.
if(!all(vapply(gnamesList, function(g) identical(g, gnamesList[[1]]), logical(1))))
{
  warning("Gene (row) names differ between cohort CNV matrices; rows of bigCnvMat may be misaligned.")
}
bigCnvMat <- do.call(cbind, theCnvQuantVecList_mat_list)
Create a vector of the cancer types that corresponds to the CNV data. I will use this below.
# One cohort label per column of bigCnvMat: each abbreviation is repeated once
# for every column of that cohort's matrix, in the order the matrices were
# column-bound.
canTypesVec <- rep(
  diseaseAbbrvs,
  times = vapply(theCnvQuantVecList_mat_list, ncol, integer(1))
)
names(canTypesVec) <- colnames(bigCnvMat)
From bigCnvMat, I need to find the 01A samples and the non-duplicates. 01A samples are the tumor samples.
# Column names are "-"-separated barcodes. Field 4 is the sample code; keep
# only the "01A" (tumor) columns.
bigCnv01A <- which(vapply(strsplit(colnames(bigCnvMat), "-"), function(a) a[4], character(1)) == "01A")
# Field 3 is used as the patient id.
patBigCnv <- vapply(strsplit(colnames(bigCnvMat)[bigCnv01A], "-"), function(a) a[3], character(1))
# Keep only the first 01A column per patient. Otherwise duplicated patient ids
# become duplicated column names, and the name-based subsetting further down
# repeats that patient's first column, double-counting the observation.
bigCnv01A <- bigCnv01A[!duplicated(patBigCnv)]
patBigCnv <- unique(patBigCnv)
bigCnv01AMat <- bigCnvMat[, bigCnv01A, drop=FALSE]
canTypesVec01A <- canTypesVec[bigCnv01A]
colnames(bigCnv01AMat) <- patBigCnv # change the column names to be the patient ids
names(canTypesVec01A) <- patBigCnv
What effect does including cancer type as a co-variate have on the results? Load the drug prediction data and subset to those patients for which we also have CNV data.
# Brings `allDrugPredictions_mat` and `cancerTypesVec` into the workspace.
load(file = paste0(theRootDir, "dataOut/allDrugPredictions_mat.RData"))
# Replace every literal "." in the sample names with "-".
colnames(allDrugPredictions_mat) <- gsub(
  pattern = ".",
  replacement = "-",
  x = colnames(allDrugPredictions_mat),
  fixed = TRUE
)

# Keep the columns whose 4th barcode field is "01A" and relabel them by the
# 3rd field (the patient id).
all01ASamples <- colnames(allDrugPredictions_mat)[
  which(
    vapply(
      strsplit(colnames(allDrugPredictions_mat), "-"),
      function(fields) fields[4],
      character(1)
    ) == "01A"
  )
]
all01patientIds <- vapply(
  strsplit(all01ASamples, "-"),
  function(fields) fields[3],
  character(1)
)
allDrugPredictions_mat_01A <- allDrugPredictions_mat[, all01ASamples]
colnames(allDrugPredictions_mat_01A) <- all01patientIds

# Patients present in both the CNV and the prediction data, in CNV column
# order. All three objects are subset identically so their columns line up.
commonPats <- patBigCnv[is.element(patBigCnv, all01patientIds)]
bigCnvMatCommonPats <- bigCnv01AMat[, commonPats]
canTypesVec01A_com <- canTypesVec01A[commonPats]
allDrugPredictions_mat_01A_com <- allDrugPredictions_mat_01A[, commonPats]
Run an analysis of ERBB2 amplification against drug sensitivity, with and without cancer type included as a factor.
# For every drug, regress the predicted drug response on ERBB2 amplification
# status, once with and once without cancer type as a covariate, and store the
# p-value and estimate of the ERBB2 term.
pVals <- rep(NA_real_, nrow(allDrugPredictions_mat_01A_com))
betaVal <- rep(NA_real_, nrow(allDrugPredictions_mat_01A_com))
pVals_withType <- rep(NA_real_, nrow(allDrugPredictions_mat_01A_com))
betaVal_withType <- rep(NA_real_, nrow(allDrugPredictions_mat_01A_com))
# 1 where the ERBB2 CNV value exceeds 1, else 0 (NA stays NA and is dropped by lm).
erbb2Amp <- as.numeric(bigCnvMatCommonPats["ERBB2",] > 1)
for(i in seq_len(nrow(allDrugPredictions_mat_01A_com)))
{
  drugPred <- allDrugPredictions_mat_01A_com[i,]

  # Each model is fitted once. The ERBB2 row is selected by name: coef(summary())
  # drops non-estimable terms, so a positional "[2, ]" could silently return a
  # cancer-type coefficient.

  # control for cancer type
  coefsWithType <- coef(summary(lm(drugPred~erbb2Amp+factor(canTypesVec01A_com))))
  pVals_withType[i] <- coefsWithType["erbb2Amp", "Pr(>|t|)"]
  betaVal_withType[i] <- coefsWithType["erbb2Amp", "Estimate"]

  # don't control for cancer type
  coefsNoType <- coef(summary(lm(drugPred~erbb2Amp)))
  pVals[i] <- coefsNoType["erbb2Amp", "Pr(>|t|)"]
  betaVal[i] <- coefsNoType["erbb2Amp", "Estimate"]
}
names(pVals) <- rownames(allDrugPredictions_mat_01A_com)
names(betaVal) <- rownames(allDrugPredictions_mat_01A_com)
names(pVals_withType) <- rownames(allDrugPredictions_mat_01A_com)
names(betaVal_withType) <- rownames(allDrugPredictions_mat_01A_com)
Print some of the top p-values for lapatinib and ERBB2, with and without controlling for cancer type. These are one-tailed statistics, as we are asking which drugs are more likely to be effective in ERBB2-positive cancers.
# alternative="less": a drug gets p/2 when its ERBB2 coefficient is negative and 1 - p/2 otherwise
# (presumably a negative coefficient means greater predicted sensitivity -- confirm against the prediction scale).
psOne_withType <- makeOneTail(pVals_withType, betaVal_withType, alternative="less")
psOne <- makeOneTail(pVals, betaVal, alternative="less")
# Ten smallest one-tailed p-values from the model that includes cancer type.
print(sort(psOne_withType)[1:10])
## Lapatinib A.443654 PF.4708671
## 2.386063e-10 6.307226e-08 7.753145e-08
## AKT.inhibitor.VIII MK.2206 WZ.1.84
## 3.255794e-06 8.739821e-06 1.651061e-05
## AZD6482 Bicalutamide BIBW2992
## 3.132285e-05 5.815255e-05 7.887527e-05
## JW.7.52.1
## 9.197318e-05
# Twenty smallest one-tailed p-values from the model without cancer type.
print(sort(psOne)[1:20])
## MK.2206 A.443654 AKT.inhibitor.VIII
## 1.534971e-28 4.719192e-21 3.697310e-20
## PF.4708671 GDC0941 BIBW2992
## 1.094027e-16 8.942220e-16 4.509543e-14
## AZD6482 X17.AAG JW.7.52.1
## 1.307751e-12 9.743849e-11 2.109262e-09
## Z.LLNle.CHO AICAR GW.441756
## 2.201576e-08 4.052831e-07 1.630904e-06
## S.Trityl.L.cysteine WZ.1.84 BI.2536
## 1.770288e-06 3.579372e-06 3.990303e-06
## PAC.1 Lapatinib Rapamycin
## 4.834677e-06 8.568504e-06 1.268821e-05
## Pazopanib CGP.082996
## 1.716464e-05 2.376135e-05
# Histogram of -log10 one-tailed p-values across all drugs (model without
# cancer type); the red vertical line marks Lapatinib.
svg(
  filename = paste0(theRootDir, "figures/lapatinib_noCorrection.svg"),
  width = 4,
  height = 4
)
hist(
  -log10(psOne),
  breaks = 100,
  col = "#8dd3c7",
  main = "Lapatinib",
  las = 1,
  xlab = expression("-Log"[10]*"P-value"),
  cex.axis = 0.75
)
abline(v = -log10(psOne["Lapatinib"]), col = "red")
dev.off()
## png
## 2
# Histogram of -log10 one-tailed p-values across all drugs (model that includes
# cancer type); the red vertical line marks Lapatinib.
svg(
  filename = paste0(theRootDir, "figures/lapatinib_corrForType.svg"),
  width = 4,
  height = 4
)
hist(
  -log10(psOne_withType),
  breaks = 100,
  col = "#8dd3c7",
  main = "Lapatinib",
  las = 1,
  xlab = expression("-Log"[10]*"P-value"),
  cex.axis = 0.75
)
abline(v = -log10(psOne_withType["Lapatinib"]), col = "red")
dev.off()
## png
## 2
Test every drug against EVERY CNV, while controlling for cancer type. I will parallelize this over 4 cores. NB, you may want to reduce that number based on the number of available cores.
# 0/1 indicator matrices with the same dimensions and dimnames as
# bigCnvMatCommonPats: "amps" marks values above 1 and "dels" marks values
# below -1 (NA stays NA).
# The comparison is vectorized over the whole matrix. The previous per-column
# apply() was slower and collapsed to a plain vector for a one-row matrix.
# Adding 0 converts the logical matrix to numeric.
bigCnvMatCommonPats_amps <- (bigCnvMatCommonPats > 1) + 0
bigCnvMatCommonPats_dels <- (bigCnvMatCommonPats < (-1)) + 0
# Test one gene's amplification status against every drug, controlling for
# cancer type.
#
# Reads the globals bigCnvMatCommonPats_amps (genes x patients, 0/1),
# allDrugPredictions_mat_01A_com (drugs x patients) and canTypesVec01A_com
# (cancer type per patient).
#
# Args:
#   j: row index of the gene in bigCnvMatCommonPats_amps.
#
# Returns:
#   list(pVals, betaVal): two numeric vectors named by drug, holding the
#   p-value and estimate of the amplification term. Both are zero-length when
#   the gene is amplified in 50 or fewer samples. An entry is NA if the
#   amplification term could not be estimated for that drug.
theFun <- function(j)
{
  pVals <- numeric()
  betaVal <- numeric()
  cnvAmp <- bigCnvMatCommonPats_amps[j, ]
  if(sum(cnvAmp, na.rm=TRUE) > 50) # make sure the gene is amplified in more than 50 samples
  {
    nDrugs <- nrow(allDrugPredictions_mat_01A_com)
    pVals <- rep(NA_real_, nDrugs)
    betaVal <- rep(NA_real_, nDrugs)
    cancerType <- factor(canTypesVec01A_com)
    for(i in seq_len(nDrugs))
    {
      drugPred <- allDrugPredictions_mat_01A_com[i, ]
      theMod <- coef(summary(lm(drugPred~cnvAmp+cancerType)))
      # Select the amplification row by name: coef(summary()) drops
      # non-estimable terms, so row 2 is not guaranteed to be this term.
      if("cnvAmp" %in% rownames(theMod))
      {
        pVals[i] <- theMod["cnvAmp", "Pr(>|t|)"]
        betaVal[i] <- theMod["cnvAmp", "Estimate"]
      }
    }
    names(pVals) <- rownames(allDrugPredictions_mat_01A_com)
    names(betaVal) <- rownames(allDrugPredictions_mat_01A_com)
  }
  return(list(pVals, betaVal))
}
# Apply theFun to every gene row in parallel; each element of allCors is the
# list(pVals, betaVal) returned for that gene.
allCors <- mclapply(
  seq_len(nrow(bigCnvMatCommonPats)),
  theFun,
  mc.cores = 4 # NB: You may want to change this number based on the number of available cores.
)
names(allCors) <- rownames(bigCnvMatCommonPats)

# Restrict the analysis to genes amplified in more than 50 samples, the same
# condition theFun uses before fitting any model.
hasAmps <- rowSums(bigCnvMatCommonPats_amps, na.rm = TRUE) > 50
allCors_hasAmps <- allCors[hasAmps]

# Collect the first (p-values) and second (estimates) element of each gene's
# result; sapply binds the per-gene vectors into drug x gene matrices.
pVals <- sapply(allCors_hasAmps, `[[`, 1)
betas <- sapply(allCors_hasAmps, `[[`, 2)

# Ten smallest p-values over all drug:gene pairs.
# It's going to be difficult to get at causality in a systematic way here.
print(sort(unmatrix(pVals))[1:10])
## OSI.906:RP11-434C1.1 PF.4708671:MIR5708 PF.4708671:MRPS28
## 2.404729e-37 9.566338e-37 4.901663e-36
## OSI.906:PRB3 OSI.906:PRB1 OSI.906:PRB2
## 2.608545e-35 2.726659e-35 2.726659e-35
## PF.4708671:TPD52 OSI.906:KLRC4 OSI.906:KLRD1
## 3.528239e-35 1.687924e-34 1.693637e-34
## OSI.906:KLRC4-KLRK1
## 1.695912e-34
# Two-tailed p-value of the ERBB2 amplification term for Lapatinib, from the model that controls for cancer type.
print(pVals["Lapatinib", "ERBB2"])
## [1] 4.772127e-10
Let's also incorporate GLDS (general levels of drug sensitivity).
# Build, for every drug, a matrix of principal components computed from the
# predictions of its "negative control" drugs: drugs in a different category
# that are not among the drugs most correlated with it. The result is
# controlPcsList[[j]] (samples x PCs) for the j-th drug.
drugRelatedness <- read.csv(paste(theRootDir, "dataIn/categorys.csv", sep=""), as.is=TRUE)
theseDrugNames <- rownames(allDrugPredictions_mat_01A_com)
# Strip the "_IC_50" suffix. sub() always returns one value per row, whereas
# unlist(strsplit()) changes length when a name is empty or contains the
# pattern more than once.
drugRelatedness[, "theseDrugNames"] <- sub("_IC_50", "", drugRelatedness[, "DrugNamesOtherFile"], fixed=TRUE)
predsBySample <- t(allDrugPredictions_mat_01A_com) # samples x drugs; transposed once, not per iteration
pairCor <- cor(predsBySample, method="spearman")
nNearDrugs <- 20 # number of most-correlated drugs to exclude from the negative controls
controlPcsList <- vector("list", length(theseDrugNames))
for(j in seq_along(theseDrugNames))
{
  thisDrug <- theseDrugNames[j]
  categoryThisDrug <- drugRelatedness[which(drugRelatedness[, "theseDrugNames"] == thisDrug), "Drug.Category"]
  negControlDrugs <- na.omit(drugRelatedness[!drugRelatedness[, "Drug.Category"] %in% categoryThisDrug, "theseDrugNames"])
  # Order the other drugs by absolute correlation with this drug, strongest
  # first. The previous rank(x)[118:137] did not sort: rank() returns ranks in
  # the original order, so it always picked the drugs in rows 118-137. It also
  # hard-coded the number of drugs.
  absCor <- abs(pairCor[, thisDrug])
  rankedDrugs <- setdiff(names(absCor)[order(absCor, decreasing=TRUE)], thisDrug)
  pairwiseCorNear <- head(rankedDrugs, nNearDrugs)
  negControlDrugs <- setdiff(negControlDrugs, pairwiseCorNear) # remove very highly correlated drugs from "negative controls"
  # Keep only drugs that have predictions, so a category-file entry without a
  # matching row does not cause a subscript error.
  negControlDrugs <- intersect(negControlDrugs, theseDrugNames)
  controlPCsAll <- prcomp(predsBySample[, negControlDrugs])$x
  controlPcsList[[j]] <- controlPCsAll
}
Run across all CNVs and all drugs with GLDS…..
# Test one gene's amplification status against every drug, controlling for
# cancer type and for the drug's negative-control principal components.
#
# Reads the globals bigCnvMatCommonPats_amps (genes x patients, 0/1),
# allDrugPredictions_mat_01A_com (drugs x patients), canTypesVec01A_com
# (cancer type per patient) and controlPcsList (one samples x PCs matrix per
# drug).
#
# Args:
#   j: row index of the gene in bigCnvMatCommonPats_amps.
#
# Returns:
#   list(pVals, betaVal): two numeric vectors named by drug, holding the
#   p-value and estimate of the amplification term. Both are zero-length when
#   the gene is amplified in 50 or fewer samples. An entry is NA if the
#   amplification term could not be estimated for that drug.
theFun <- function(j)
{
  pVals <- numeric()
  betaVal <- numeric()
  cnvAmp <- bigCnvMatCommonPats_amps[j, ]
  if(sum(cnvAmp, na.rm=TRUE) > 50) # make sure the gene is amplified in more than 50 samples
  {
    nDrugs <- nrow(allDrugPredictions_mat_01A_com)
    pVals <- rep(NA_real_, nDrugs)
    betaVal <- rep(NA_real_, nDrugs)
    cancerType <- factor(canTypesVec01A_com)
    for(i in seq_len(nDrugs))
    {
      drugPred <- allDrugPredictions_mat_01A_com[i, ]
      # as with the mutation data, control for 50 pcs (or all of them if fewer are available)
      controlPcs <- controlPcsList[[i]]
      controlPcs <- controlPcs[, seq_len(min(50, ncol(controlPcs))), drop=FALSE]
      theMod <- coef(summary(lm(drugPred~cnvAmp+cancerType+controlPcs)))
      # Select the amplification row by name: coef(summary()) drops
      # non-estimable terms, so row 2 is not guaranteed to be this term.
      if("cnvAmp" %in% rownames(theMod))
      {
        pVals[i] <- theMod["cnvAmp", "Pr(>|t|)"]
        betaVal[i] <- theMod["cnvAmp", "Estimate"]
      }
    }
    names(pVals) <- rownames(allDrugPredictions_mat_01A_com)
    names(betaVal) <- rownames(allDrugPredictions_mat_01A_com)
  }
  return(list(pVals, betaVal))
}
# Apply the GLDS-adjusted theFun to every gene row in parallel; each element
# of allCors is the list(pVals, betaVal) returned for that gene.
allCors <- mclapply(
  seq_len(nrow(bigCnvMatCommonPats)),
  theFun,
  mc.cores = 4
)
names(allCors) <- rownames(bigCnvMatCommonPats)

# Restrict the analysis to genes amplified in more than 50 samples, the same
# condition theFun uses before fitting any model.
hasAmps <- rowSums(bigCnvMatCommonPats_amps, na.rm = TRUE) > 50
allCors_hasAmps <- allCors[hasAmps]

# Collect the first (p-values) and second (estimates) element of each gene's
# result; sapply binds the per-gene vectors into drug x gene matrices.
pVals <- sapply(allCors_hasAmps, `[[`, 1)
betas <- sapply(allCors_hasAmps, `[[`, 2)
Print some of the top results and the results for lapatinib and ERBB2. It's going to be difficult to get at causality in a systematic way here. Something for future work.
# Ten smallest p-values over all drug:gene pairs from the GLDS-adjusted models.
print(sort(unmatrix(pVals))[1:10])
## GSK269962A:LOC100130075 GSK269962A:MDM2 GSK269962A:LYZ
## 8.741670e-40 4.615925e-37 3.491572e-35
## Bortezomib:SAMD12-AS1 Bortezomib:AARD Bortezomib:FZD6
## 4.929592e-34 8.929976e-34 9.043687e-34
## Bortezomib:MIR3151 Bortezomib:SLC30A8 Bortezomib:COLEC10
## 1.455358e-33 1.517479e-33 1.909074e-33
## Bortezomib:BAALC
## 3.375012e-33