In this script I want to apply the models across every TCGA sample. I will look at the association between CNVs and drug sensitivity. In particular, I am interested in whether the ERBB2 and lapatinib association is recovered here. Load the required libraries and set the root directory.

library("pRRophetic")
## Warning: replacing previous import by 'genefilter::Anova' when loading
## 'pRRophetic'
library("parallel")
library("gdata")
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
# Root directory that every input ("dataIn/") and output ("dataOut/", "figures/") path below is built from; note it already ends with a trailing slash.
theRootDir <- "/mnt/data_scratch/finalData/"

Define a simple function to turn two-tailed p-values (e.g. those from a linear regression) into one-tailed p-values, using the sign of the corresponding effect estimate.

# Convert two-tailed p-values (e.g. from lm coefficient tests) into one-tailed
# p-values, using the sign of the matching effect estimate.
#
# Args:
#   pvals:       numeric vector of two-tailed p-values; its names are carried
#                over to the result.
#   betaVals:    numeric vector of effect estimates, parallel to pvals (only the
#                first length(pvals) entries are used).
#   alternative: "greater" (tested direction is beta > 0) or "less" (beta < 0).
#
# Returns:
#   Numeric vector the same length as pvals: p/2 where the estimate lies in the
#   tested direction, 1 - p/2 otherwise (an estimate of exactly 0 counts as
#   "otherwise"). An NA p-value or estimate yields NA. Empty input yields an
#   empty numeric vector.
makeOneTail <- function(pvals, betaVals, alternative="greater") 
{
  # Fail loudly on an unsupported alternative instead of returning nothing.
  if(!is.character(alternative) || length(alternative) != 1 || !(alternative %in% c("greater", "less")))
    stop("alternative must be \"greater\" or \"less\"", call.=FALSE)
  if(length(betaVals) < length(pvals))
    stop("betaVals must supply one value for each element of pvals", call.=FALSE)

  betaVals <- betaVals[seq_along(pvals)]
  halfP <- as.numeric(pvals) / 2
  inTestedDirection <- if(alternative == "greater") betaVals > 0 else betaVals < 0

  # as.numeric() keeps the result numeric for empty input and drops any names
  # that ifelse() copied over from betaVals.
  outVec <- as.numeric(ifelse(inTestedDirection, halfP, 1 - halfP))
  names(outVec) <- names(pvals)
  return(outVec)
}

Load the expression data….

# Load the expression data. Per the original note, the .RData file provides
# cancerTypesVec and tenRuvNewStandardApproach.
load(file=paste0(theRootDir, "/dataIn/tenRuvNewStandardApproach.RData"))
# Sample identifiers use "." where the rest of the script expects "-"; convert them.
colnames(tenRuvNewStandardApproach) <- gsub(".", "-", colnames(tenRuvNewStandardApproach), fixed=TRUE)

Load the cnv data for all of TCGA

# Cohort abbreviations; one CNV .RData file is loaded per abbreviation.
diseaseAbbrvs <- c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "DLBC", "GBM", "HNSC", "KICH", "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")
theCnvQuantVecList_mat_list <- list()
tumorSamps_list <- list()
gnamesList <- list()
for(i in seq_along(diseaseAbbrvs))
{
  # Per the original note, each file provides theCnvQuantVecList_mat and tumorSamps.
  load(file=paste0(theRootDir, "/dataIn/tcga_cnv_subtracted/cnvsMappedToGenes/", diseaseAbbrvs[i], ".RData"))
  theCnvQuantVecList_mat_list[[i]] <- theCnvQuantVecList_mat
  tumorSamps_list[[i]] <- tumorSamps
  # NOTE(review): this stores gnamesList inside itself; presumably a per-cohort
  # gene-name object was intended. Kept as-is because the intended source is not
  # visible here and gnamesList is not used further down in this script.
  gnamesList[[i]] <- gnamesList
}
# Join the per-cohort matrices column-wise into one gene x sample matrix
# (assumes every cohort matrix has the same rows in the same order -- TODO confirm).
bigCnvMat <- do.call(cbind, theCnvQuantVecList_mat_list)

Create a vector of the cancer types that corresponds to the CNV data. I will use this below.

# One cancer-type label per column of bigCnvMat: each cohort abbreviation is
# repeated once for every column of that cohort's CNV matrix, in cohort order.
canTypesVec <- rep(diseaseAbbrvs, times=vapply(theCnvQuantVecList_mat_list, ncol, integer(1)))
names(canTypesVec) <- colnames(bigCnvMat)

From bigCnvMat, I need to find the 01A samples and the non-duplicates. 01A samples are the tumor samples.

# Keep only columns whose 4th "-"-separated identifier field is "01A".
bigCnv01A <- which(vapply(strsplit(colnames(bigCnvMat), "-"), function(parts) parts[4], character(1)) == "01A")
bigCnv01AMat <- bigCnvMat[, bigCnv01A]
canTypesVec01A <- canTypesVec[bigCnv01A]
# The 3rd identifier field is used as the patient id from here on.
# NOTE(review): nothing here removes duplicated patient ids; if two 01A columns
# share an id, later name-based column lookups will pick the first one -- confirm.
patBigCnv <- vapply(strsplit(colnames(bigCnv01AMat), "-"), function(parts) parts[3], character(1))
colnames(bigCnv01AMat) <- patBigCnv # columns are now labelled by patient id
names(canTypesVec01A) <- patBigCnv

What effect does including cancer type as a co-variate have on the results? Load the drug prediction data and subset to those patients for which we also have CNV data.

# Load the drug predictions. Per the original note, the file provides
# allDrugPredictions_mat and cancerTypesVec (replacing any cancerTypesVec loaded earlier).
load(file=paste0(theRootDir, "dataOut/allDrugPredictions_mat.RData"))
colnames(allDrugPredictions_mat) <- gsub(".", "-", colnames(allDrugPredictions_mat), fixed=TRUE)

# Restrict the predictions to "01A" samples and relabel the columns by patient id
# (3rd identifier field), mirroring what was done for the CNV matrix.
all01ASamples <- colnames(allDrugPredictions_mat)[which(vapply(strsplit(colnames(allDrugPredictions_mat), "-"), function(parts) parts[4], character(1)) == "01A")]
all01patientIds <- vapply(strsplit(all01ASamples, "-"), function(parts) parts[3], character(1))
allDrugPredictions_mat_01A <- allDrugPredictions_mat[, all01ASamples]
colnames(allDrugPredictions_mat_01A) <- all01patientIds

# Patients present in both the CNV data and the prediction data, in CNV column order.
commonPats <- patBigCnv[is.element(patBigCnv, all01patientIds)]
bigCnvMatCommonPats <- bigCnv01AMat[, commonPats]
canTypesVec01A_com <- canTypesVec01A[commonPats]
allDrugPredictions_mat_01A_com <- allDrugPredictions_mat_01A[, commonPats]

Run an analysis for ERBB2 amplification against drug sensitivity, with and without cancer type included as a factor.

# Regress each drug's predicted response on ERBB2 amplification status, once with
# cancer type as a covariate and once without, and record the p-value and
# estimate of the ERBB2 term for every drug.
erbb2Amp <- as.numeric(bigCnvMatCommonPats["ERBB2",] > 1) # 1 where the ERBB2 CNV value exceeds 1, else 0
cancerTypeFactor <- factor(canTypesVec01A_com)            # built once; it does not change between drugs
nDrugs <- nrow(allDrugPredictions_mat_01A_com)
pVals <- numeric(nDrugs)
betaVal <- numeric(nDrugs)
pVals_withType <- numeric(nDrugs)
betaVal_withType <- numeric(nDrugs)
for(i in seq_len(nDrugs))
{
  drugPred <- allDrugPredictions_mat_01A_com[i,]

  # Each model is fitted once. The ERBB2 row is looked up by name, so a fit in
  # which that term was dropped stops with an error instead of silently
  # returning another coefficient.
  # control for cancer type
  coefsWithType <- coef(summary(lm(drugPred~erbb2Amp+cancerTypeFactor)))
  pVals_withType[i] <- coefsWithType["erbb2Amp", "Pr(>|t|)"]
  betaVal_withType[i] <- coefsWithType["erbb2Amp", "Estimate"]

  # don't control for cancer type
  coefsNoType <- coef(summary(lm(drugPred~erbb2Amp)))
  pVals[i] <- coefsNoType["erbb2Amp", "Pr(>|t|)"]
  betaVal[i] <- coefsNoType["erbb2Amp", "Estimate"]

  # print(i)
}
names(pVals) <- rownames(allDrugPredictions_mat_01A_com)
names(pVals_withType) <- rownames(allDrugPredictions_mat_01A_com)
names(betaVal) <- rownames(allDrugPredictions_mat_01A_com)
names(betaVal_withType) <- rownames(allDrugPredictions_mat_01A_com)

Print some of the top P-values for lapatinib and ERBB2, with and without controlling for cancer type. These are one-tailed statistics, as we are asking what is more likely to be effective in ERBB2-positive cancers.

# One-tailed p-values for a negative ERBB2 coefficient (alternative="less"),
# with and without cancer type in the model; then show the smallest ones.
psOne_withType <- makeOneTail(pvals=pVals_withType, betaVals=betaVal_withType, alternative="less")
psOne <- makeOneTail(pvals=pVals, betaVals=betaVal, alternative="less")
print(sort(psOne_withType)[seq_len(10)])
##          Lapatinib           A.443654         PF.4708671 
##       2.386063e-10       6.307226e-08       7.753145e-08 
## AKT.inhibitor.VIII            MK.2206            WZ.1.84 
##       3.255794e-06       8.739821e-06       1.651061e-05 
##            AZD6482       Bicalutamide           BIBW2992 
##       3.132285e-05       5.815255e-05       7.887527e-05 
##          JW.7.52.1 
##       9.197318e-05
print(sort(psOne)[seq_len(20)])
##             MK.2206            A.443654  AKT.inhibitor.VIII 
##        1.534971e-28        4.719192e-21        3.697310e-20 
##          PF.4708671             GDC0941            BIBW2992 
##        1.094027e-16        8.942220e-16        4.509543e-14 
##             AZD6482             X17.AAG           JW.7.52.1 
##        1.307751e-12        9.743849e-11        2.109262e-09 
##         Z.LLNle.CHO               AICAR           GW.441756 
##        2.201576e-08        4.052831e-07        1.630904e-06 
## S.Trityl.L.cysteine             WZ.1.84             BI.2536 
##        1.770288e-06        3.579372e-06        3.990303e-06 
##               PAC.1           Lapatinib           Rapamycin 
##        4.834677e-06        8.568504e-06        1.268821e-05 
##           Pazopanib          CGP.082996 
##        1.716464e-05        2.376135e-05
# Histogram of -log10 one-tailed p-values across all drugs, without cancer-type
# correction; the red line marks Lapatinib.
svg(filename=paste0(theRootDir, "figures/lapatinib_noCorrection.svg"), width=4, height=4)
hist(-log10(psOne), breaks=100, col="#8dd3c7", main="Lapatinib", las=1, xlab=expression("-Log"[10]*"P-value"), cex.axis=0.75)
abline(v=-log10(psOne["Lapatinib"]), col="red")
dev.off()
## png 
##   2
# Same plot for the model that includes cancer type as a covariate.
svg(filename=paste0(theRootDir, "figures/lapatinib_corrForType.svg"), width=4, height=4)
hist(-log10(psOne_withType), breaks=100, col="#8dd3c7", main="Lapatinib", las=1, xlab=expression("-Log"[10]*"P-value"), cex.axis=0.75)
abline(v=-log10(psOne_withType["Lapatinib"]), col="red")
dev.off()
## png 
##   2

Test every drug against EVERY CNV, while controlling for cancer type. I will parallelize this over 4 cores. NB, you may want to reduce that number based on the number of available cores.

# 0/1 indicator matrices with the same dimensions and dimnames as
# bigCnvMatCommonPats: "amps" marks CNV values above 1, "dels" marks values
# below -1 (NA stays NA). The comparison is vectorised over the whole matrix,
# which keeps row and column names and also works for a one-row matrix.
bigCnvMatCommonPats_amps <- (bigCnvMatCommonPats > 1) + 0
bigCnvMatCommonPats_dels <- (bigCnvMatCommonPats < -1) + 0
# Test one gene's amplification status against every drug's predicted response,
# controlling for cancer type.
#
# Reads these objects from the calling environment: bigCnvMatCommonPats_amps
# (gene x sample 0/1 matrix), allDrugPredictions_mat_01A_com (drug x sample
# matrix) and canTypesVec01A_com (cancer type per sample).
#
# Args:
#   j: row index of the gene in bigCnvMatCommonPats_amps.
#
# Returns:
#   list(pVals, betaVal): two numeric vectors named by drug, holding the p-value
#   and estimate of the amplification term. Both are empty when the gene is
#   amplified in 50 or fewer samples. An entry is NA when lm dropped the
#   amplification term for that fit.
theFun <- function(j)
{
  pVals <- numeric()
  betaVal <- numeric()

  ampStatus <- bigCnvMatCommonPats_amps[j, ]
  if(sum(ampStatus, na.rm=TRUE) > 50) # only test genes amplified in more than 50 samples
  {
    cancerType <- factor(canTypesVec01A_com) # identical for every drug, so built once
    drugNames <- rownames(allDrugPredictions_mat_01A_com)
    nDrugs <- nrow(allDrugPredictions_mat_01A_com)
    pVals <- rep(NA_real_, nDrugs)
    betaVal <- rep(NA_real_, nDrugs)
    for(i in seq_len(nDrugs))
    {
      drugPred <- allDrugPredictions_mat_01A_com[i,]
      theMod <- coef(summary(lm(drugPred~ampStatus+cancerType)))
      # Look the term up by name: summary() omits dropped coefficients, so a fixed
      # row position could otherwise return a cancer-type coefficient.
      if("ampStatus" %in% rownames(theMod))
      {
        pVals[i] <- theMod["ampStatus", "Pr(>|t|)"]
        betaVal[i] <- theMod["ampStatus", "Estimate"]
      }
    }
    names(pVals) <- drugNames
    names(betaVal) <- drugNames

  }
  return(list(pVals, betaVal))
}

# Run theFun over every gene (row of bigCnvMatCommonPats) in parallel.
allCors <- mclapply(seq_len(nrow(bigCnvMatCommonPats)), theFun, mc.cores=4) # NB: You may want to change this number based on the number of available cores.
names(allCors) <- rownames(bigCnvMatCommonPats)
# Keep only genes amplified in more than 50 samples (the same cut-off theFun applies).
hasAmps <- rowSums(bigCnvMatCommonPats_amps, na.rm=TRUE) > 50
allCors_hasAmps <- allCors[hasAmps]
# Collate per-gene results: first list element holds the p-values, second the estimates.
pVals <- sapply(allCors_hasAmps, "[[", 1)
betas <- sapply(allCors_hasAmps, "[[", 2)

print(sort(unmatrix(pVals))[seq_len(10)]) # its going to be difficult to get at causality in a systematic way here....
## OSI.906:RP11-434C1.1   PF.4708671:MIR5708    PF.4708671:MRPS28 
##         2.404729e-37         9.566338e-37         4.901663e-36 
##         OSI.906:PRB3         OSI.906:PRB1         OSI.906:PRB2 
##         2.608545e-35         2.726659e-35         2.726659e-35 
##     PF.4708671:TPD52        OSI.906:KLRC4        OSI.906:KLRD1 
##         3.528239e-35         1.687924e-34         1.693637e-34 
##  OSI.906:KLRC4-KLRK1 
##         1.695912e-34
print(pVals["Lapatinib", "ERBB2"])
## [1] 4.772127e-10

Let's also incorporate GLDS (general levels of drug sensitivity).

# Build, for every drug, the principal components of its "negative control"
# drugs: drugs from a different category that are not among the drugs most
# correlated with it. The result is controlPcsList, one PC score matrix per drug,
# in the row order of allDrugPredictions_mat_01A_com.
drugRelatedness <- read.csv(paste0(theRootDir, "dataIn/categorys.csv"), as.is=TRUE)
theseDrugNames <- rownames(allDrugPredictions_mat_01A_com)
# Strip the "_IC_50" suffix to get names comparable with the prediction matrix
# rows. sub() returns exactly one value per row, also for blank or NA entries.
drugRelatedness[, "theseDrugNames"] <- sub("_IC_50", "", drugRelatedness[, "DrugNamesOtherFile"], fixed=TRUE)
drugPredsBySample <- t(allDrugPredictions_mat_01A_com) # samples x drugs; transposed once, not per drug
pairCor <- cor(drugPredsBySample, method="spearman")
nCorrelatedToDrop <- 20 # the original hard-coded window 118:137 also covered 20 drugs
controlPcsList <- list()
for(j in seq_len(nrow(allDrugPredictions_mat_01A_com)))
{
  thisDrug <- theseDrugNames[j]
  categoryThisDrug <- drugRelatedness[which(drugRelatedness[, "theseDrugNames"] == thisDrug), "Drug.Category"]
  negControlDrugs <- na.omit(drugRelatedness[!drugRelatedness[, "Drug.Category"] %in% categoryThisDrug, "theseDrugNames"])

  # The other drugs most correlated with this one, by absolute Spearman
  # correlation. sort() is required here: rank() keeps the original element
  # order, so names(rank(x))[118:137] picked drugs by their position in the
  # matrix rather than by correlation.
  # NOTE(review): this changes which drugs are excluded, so downstream numbers
  # will differ from output generated with the previous code.
  absCorOthers <- abs(pairCor[setdiff(theseDrugNames, thisDrug), thisDrug])
  pairwiseCorNear <- names(sort(absCorOthers, decreasing=TRUE))[seq_len(min(nCorrelatedToDrop, length(absCorOthers)))]

  negControlDrugs <- setdiff(negControlDrugs, pairwiseCorNear) # remove very highly correlated drugs from "negative controls"
  negControlDrugs <- intersect(negControlDrugs, theseDrugNames) # only drugs that actually have predictions
  controlPCsAll <- prcomp(drugPredsBySample[, negControlDrugs])$x
  controlPcsList[[j]] <- controlPCsAll
}

Run across all CNVs and all drugs with GLDS…..

# Test one gene's amplification status against every drug's predicted response,
# controlling for cancer type and for that drug's control principal components.
#
# Reads these objects from the calling environment: bigCnvMatCommonPats_amps
# (gene x sample 0/1 matrix), allDrugPredictions_mat_01A_com (drug x sample
# matrix), canTypesVec01A_com (cancer type per sample) and controlPcsList (one
# PC score matrix per drug, in drug row order).
#
# Args:
#   j: row index of the gene in bigCnvMatCommonPats_amps.
#
# Returns:
#   list(pVals, betaVal): two numeric vectors named by drug, holding the p-value
#   and estimate of the amplification term. Both are empty when the gene is
#   amplified in 50 or fewer samples. An entry is NA when lm dropped the
#   amplification term for that fit.
theFun <- function(j)
{
  pVals <- numeric()
  betaVal <- numeric()

  ampStatus <- bigCnvMatCommonPats_amps[j, ]
  if(sum(ampStatus, na.rm=TRUE) > 50) # only test genes amplified in more than 50 samples
  {
    cancerType <- factor(canTypesVec01A_com) # identical for every drug, so built once
    drugNames <- rownames(allDrugPredictions_mat_01A_com)
    nDrugs <- nrow(allDrugPredictions_mat_01A_com)
    pVals <- rep(NA_real_, nDrugs)
    betaVal <- rep(NA_real_, nDrugs)
    for(i in seq_len(nDrugs))
    {
      drugPred <- allDrugPredictions_mat_01A_com[i,]
      # as with the mutation data, control for 50 pcs (or all of them when fewer exist)
      nPcs <- min(50, ncol(controlPcsList[[i]]))
      controlPcs <- controlPcsList[[i]][, seq_len(nPcs), drop=FALSE]
      theMod <- coef(summary(lm(drugPred~ampStatus+cancerType+controlPcs)))
      # Look the term up by name: summary() omits dropped coefficients, so a fixed
      # row position could otherwise return a covariate's coefficient.
      if("ampStatus" %in% rownames(theMod))
      {
        pVals[i] <- theMod["ampStatus", "Pr(>|t|)"]
        betaVal[i] <- theMod["ampStatus", "Estimate"]
      }
    }
    names(pVals) <- drugNames
    names(betaVal) <- drugNames
  }
  # print(j)
  return(list(pVals, betaVal))
}
# Run the GLDS-adjusted theFun over every gene (row of bigCnvMatCommonPats) in parallel.
allCors <- mclapply(seq_len(nrow(bigCnvMatCommonPats)), theFun, mc.cores=4)
names(allCors) <- rownames(bigCnvMatCommonPats)
# Keep only genes amplified in more than 50 samples (the same cut-off theFun applies).
hasAmps <- rowSums(bigCnvMatCommonPats_amps, na.rm=TRUE) > 50
allCors_hasAmps <- allCors[hasAmps]
# Collate per-gene results: first list element holds the p-values, second the estimates.
pVals <- sapply(allCors_hasAmps, "[[", 1)
betas <- sapply(allCors_hasAmps, "[[", 2)

Print some of the top results and the results for lapatinib and ERBB2. It's going to be difficult to get at causality in a systematic way here. Something for future work.

# Flatten the p-value matrix to a named vector and show the ten smallest entries.
print(sort(unmatrix(pVals))[seq_len(10)])
## GSK269962A:LOC100130075         GSK269962A:MDM2          GSK269962A:LYZ 
##            8.741670e-40            4.615925e-37            3.491572e-35 
##   Bortezomib:SAMD12-AS1         Bortezomib:AARD         Bortezomib:FZD6 
##            4.929592e-34            8.929976e-34            9.043687e-34 
##      Bortezomib:MIR3151      Bortezomib:SLC30A8      Bortezomib:COLEC10 
##            1.455358e-33            1.517479e-33            1.909074e-33 
##        Bortezomib:BAALC 
##            3.375012e-33