Here, I want to try to classify the type of cancer in TCGA using models derived from cell lines. This file also contains the code for creating Figure 1(a). Set the root directory to the location of the data. This must be modified for your own use (based on the location of the files).
# Root directory of the data. The code below builds paths by pasting "dataIn/..." and
# "figures/..." directly onto this string (no separator), so it must end with "/".
theRootDir <- "/mnt/data_scratch/finalData/"
Load the pRRophetic package.
library(pRRophetic)
## Warning: replacing previous import by 'genefilter::Anova' when loading
## 'pRRophetic'
Load the CGP expression data and cancer type data.
# Load the "drugAndPhenoCgp" dataset.
# NOTE(review): drugSensitivityDataCgp and drugToCellLineDataCgp, used below, are presumably
# created by this data() call — confirm against the pRRophetic package.
data(drugAndPhenoCgp)
# List the distinct values of the "Cancer.Type" column.
unique(drugSensitivityDataCgp[,"Cancer.Type"])
## [1] "blood" "CNS" "soft tissue"
## [4] "bone" "lung" "skin"
## [7] "bladder" "uterus" "pancreas"
## [10] "upper aerodigestive" "breast" "kidney"
## [13] "ovary" "other" "GI tract"
## [16] "thyroid"
We will need to manually map the TCGA cancers to the cancer types here and identify those which clearly map unambiguously.
# Directory containing all TCGA gene expression data.
theDirs <- dir(paste0(theRootDir, "dataIn/rnaSeq"))

# The cancer-type code is the second "_"-separated token of the third
# "."-separated field of each directory entry name.
cancerTypeNames <- vapply(
  strsplit(theDirs, ".", fixed = TRUE),
  function(fields) strsplit(fields[[3]], "_")[[1]][2],
  character(1)
)

# The listing can contain more than one entry per cancer type (when
# cancerTypeNames is printed later in this file, every code shows up twice),
# whereas the manual mapping below has one element per distinct code.
# De-duplicate so that the two vectors line up one-to-one; otherwise the
# names are shifted against the codes and padded with NA.
cancerTypeNames <- unique(cancerTypeNames)

# CGP tissue label for each TCGA cancer type, in the order of cancerTypeNames.
# "none" presumably marks types without an unambiguous CGP counterpart.
cancerTypeNames_match_cgp_type <- c(
  "none", "bladder", "breast", "uterus", "none", "GI tract", "GI tract", "blood",
  "upper aerodigestive", "CNS", "CNS", "upper aerodigestive", "kidney", "kidney", "kidney",
  "kidney", "blood", "CNS", "none", "lung", "lung", "none", "ovary", "pancreas", "none",
  "other", "none", "none", "skin", "GI tract", "none", "thyroid", "thyroid", "uterus", "uterus", "skin"
)

# The mapping is purely positional, so it is only meaningful when both vectors
# have the same length.
if (length(cancerTypeNames) != length(cancerTypeNames_match_cgp_type)) {
  stop(
    "Found ", length(cancerTypeNames), " distinct TCGA cancer types but the manual CGP mapping has ",
    length(cancerTypeNames_match_cgp_type), " entries; update cancerTypeNames_match_cgp_type.",
    call. = FALSE
  )
}
names(cancerTypeNames) <- cancerTypeNames_match_cgp_type
Get the cancer type information for CGP.
# Lookup vector: the "Cancer.Type" value of every row, named by its "Cell.Line" value.
cellLineToCancerType <- setNames(
  drugSensitivityDataCgp[, "Cancer.Type"],
  drugSensitivityDataCgp[, "Cell.Line"]
)
These tissues have a large number of samples in both CGP and TCGA, and are unambiguously annotated.
tissuesUsing <- c("breast", "skin", "CNS", "blood", "lung")

# Cell lines (name and cancer type) annotated with one of the selected tissues.
inSelectedTissue <- drugSensitivityDataCgp[, "Cancer.Type"] %in% tissuesUsing
relevantCellLines <- drugSensitivityDataCgp[inSelectedTissue, c("Cell.Line", "Cancer.Type")]

# For every tissue, collect the "Array.Data.File" entries whose "Source.Name"
# is one of that tissue's cell lines.
celFilesList <- lapply(tissuesUsing, function(tissue) {
  cellLinesOfTissue <- relevantCellLines[relevantCellLines[, 2] %in% tissue, 1]
  isOfTissue <- drugToCellLineDataCgp[, "Source.Name"] %in% cellLinesOfTissue
  drugToCellLineDataCgp[isOfTissue, "Array.Data.File"]
})
names(celFilesList) <- tissuesUsing

# One tissue label per array file, in the same order as the concatenated file list.
typeVec <- rep(tissuesUsing, times = vapply(celFilesList, length, integer(1)))

# Expression matrix restricted to the selected arrays (columns).
# NOTE(review): gdsc_brainarray_syms is not created in the visible code — presumably it
# comes from a dataset loaded elsewhere; confirm.
allCelFiles <- do.call(c, celFilesList)
geneExprMat <- gdsc_brainarray_syms[, allCelFiles]
df <- data.frame(typeVec, t(geneExprMat))
Load the TCGA data. This is the raw TPM created in batch_correct_tcga_data.R. Also load the RUV batch corrected data.
load(file = paste0(theRootDir, "dataIn/allExprData.RData")) # allExprData

# The standalone cancerTypesVec.RData file is treated as optional: an
# unconditional load() stops with "cannot open the connection" when it is
# absent, while the batch-corrected .RData file loaded afterwards is annotated
# as also providing cancerTypesVec.
cancerTypesVecFile <- paste0(theRootDir, "dataIn/cancerTypesVec.RData")
if (file.exists(cancerTypesVecFile)) {
  load(file = cancerTypesVecFile) # cancerTypesVec
} else {
  message("Skipping missing file: ", cancerTypesVecFile)
}
## Warning in readChar(con, 5L, useBytes = TRUE): cannot open compressed file
## '/mnt/data_scratch/finalData/dataIn/cancerTypesVec.RData', probable reason
## 'No such file or directory'
## Error in readChar(con, 5L, useBytes = TRUE): cannot open the connection
# Provides tenRuvNewStandardApproach and cancerTypesVec.
ruvCorrectedFile <- paste0(theRootDir, "dataIn/tenRuvNewStandardApproach.RData")
load(file = ruvCorrectedFile)
Homogenize the data and fit a logistic model with glmnet. This will combine the datasets by standardizing the mean and variance of each gene to 0 and 1 respectively, and by subsetting to only the genes that are represented in each dataset.
# Homogenize the raw TCGA matrix (first argument) with the cell-line matrix
# (second argument); the result's $train and $test components are used below.
homUncorrData <- homogenizeData(
  allExprData,
  geneExprMat,
  batchCorrect = "standardize",
  selection = 1,
  printOutput = TRUE
)
##
## 11503 gene identifiers overlap between the supplied expression matrices...
##
Fit a glmnet logistic model
library("glmnet")
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:base':
##
## crossprod, tcrossprod
## Loading required package: foreach
## foreach: simple, scalable parallel programming from Revolution Analytics
## Use Revolution R for scalability, fault tolerance and more.
## http://www.revolutionanalytics.com
## Loaded glmnet 2.0-2
# cv.glmnet expects one observation per row, hence the transposes.
trainExpr <- t(homUncorrData$train)
trainPtyle <- as.factor(typeVec)
trainDat <- data.frame(trainPtyle, trainExpr)

# alpha = 0 is the ridge penalty.
# NOTE: no RNG seed is set and cv.glmnet assigns its cross-validation folds at
# random, so a re-run may not reproduce the counts shown below exactly.
Fit <- cv.glmnet(trainExpr, trainPtyle, family = "multinomial", alpha = 0)

# Transpose the test matrix once and reuse it.
testExpr <- t(homUncorrData$test)
preDataLRR <- data.frame(testExpr)
Pred <- predict(Fit, testExpr)

# Pred is indexed as [sample, class, 1]. For each sample take the name of the
# highest-scoring class, using every class column rather than a hard-coded 1:5
# (class order here: blood, breast, CNS, lung, skin).
classification <- vapply(
  seq_len(dim(Pred)[1]),
  function(sampleIdx) names(which.max(Pred[sampleIdx, , 1])),
  character(1)
)
Show the numbers of samples that are correctly and incorrectly classified for each cancer type.
# Breast: TCGA code BRCA.
predictedForBreast <- classification[cancerTypesVec == "BRCA"]
breastCorrect <- sum(predictedForBreast == "breast")
breastIncorrect <- sum(predictedForBreast != "breast")
print(breastCorrect)
## [1] 816
print(breastIncorrect)
## [1] 396
# Blood: TCGA code LAML.
predictedForBlood <- classification[cancerTypesVec == "LAML"]
bloodCorrect <- sum(predictedForBlood == "blood")
bloodIncorrect <- sum(predictedForBlood != "blood")
print(bloodCorrect)
## [1] 173
print(bloodIncorrect)
## [1] 0
# Lung: TCGA codes LUAD and LUSC.
predictedForLung <- classification[cancerTypesVec %in% c("LUAD", "LUSC")]
lungCorrect <- sum(predictedForLung == "lung")
lungIncorrect <- sum(predictedForLung != "lung")
print(lungCorrect)
## [1] 886
print(lungIncorrect)
## [1] 242
# Skin: TCGA codes SKCM and UVM.
predictedForSkin <- classification[cancerTypesVec %in% c("SKCM", "UVM")]
skinCorrect <- sum(predictedForSkin == "skin")
skinIncorrect <- sum(predictedForSkin != "skin")
print(skinCorrect)
## [1] 462
print(skinIncorrect)
## [1] 90
# CNS: TCGA codes GBMLGG, GBM and LGG.
predictedForCns <- classification[cancerTypesVec %in% c("GBMLGG", "GBM", "LGG")]
cnsCorrect <- sum(predictedForCns == "CNS")
cnsIncorrect <- sum(predictedForCns != "CNS")
print(cnsCorrect)
## [1] 692
print(cnsIncorrect)
## [1] 4
Total number of samples correctly classified
# Correct classifications summed over the five tissues (no batch correction).
totalCorrect_noBatchCorr <- breastCorrect + bloodCorrect + lungCorrect + skinCorrect + cnsCorrect
print(totalCorrect_noBatchCorr)
## [1] 3029
Total number of samples incorrectly classified
totalIncorrect_noBatchCorr <- sum(c(breastIncorrect, bloodIncorrect, lungIncorrect, skinIncorrect, cnsIncorrect))
print(totalIncorrect_noBatchCorr)
## [1] 732

# Per-tissue counts in plotting order: breast, blood, lung, skin, CNS.
correctCounts <- c(breastCorrect, bloodCorrect, lungCorrect, skinCorrect, cnsCorrect)
incorrectCounts <- c(breastIncorrect, bloodIncorrect, lungIncorrect, skinIncorrect, cnsIncorrect)
sampleCounts <- correctCounts + incorrectCounts

# Proportion of samples correctly classified per tissue.
dat <- correctCounts / sampleCounts
labels <- c("Breast", "Blood", "Lung", "Skin", "CNS")
cols <- c("#7fc97f", "#beaed4", "#fdc086", "#ffff99", "#386cb0")

# Build the "(n=...)" axis labels from the actual counts so they cannot drift
# out of sync with the data.
barLabels <- paste0(labels, " (n=", sampleCounts, ")")

svg(paste0(theRootDir, "figures/barplot_noBatchCorrection.svg"), width = 5, height = 5.5)
par(mar = c(8.1, 4.1, 3.1, 2.1)) # large bottom margin for the rotated bar labels
barplot(dat, ylab = "Proportion Correctly Classified", col = cols, names.arg = barLabels, las = 2, cex.axis = .7)
dev.off()
## png
## 2
Compare the performance of this to the batch corrected data.
# Same homogenization call as for the raw data, now with tenRuvNewStandardApproach as the first argument.
# NOTE(review): the result overwrites homUncorrData, so from here on that name holds the
# batch-corrected data despite the "Uncorr" in its name.
homUncorrData <- homogenizeData(tenRuvNewStandardApproach, geneExprMat, batchCorrect="standardize", selection=1, printOutput=TRUE)
##
## 11503 gene identifiers overlap between the supplied expression matrices...
##
Fit a glmnet logistic model
library("glmnet")
# cv.glmnet expects one observation per row, hence the transposes.
trainExpr <- t(homUncorrData$train)
trainPtyle <- as.factor(typeVec)
trainDat <- data.frame(trainPtyle, trainExpr)

# alpha = 0 is the ridge penalty.
# NOTE: no RNG seed is set and cv.glmnet assigns its cross-validation folds at
# random, so a re-run may not reproduce the counts shown below exactly.
Fit <- cv.glmnet(trainExpr, trainPtyle, family = "multinomial", alpha = 0)

# Transpose the test matrix once and reuse it.
testExpr <- t(homUncorrData$test)
preDataLRR <- data.frame(testExpr)
Pred <- predict(Fit, testExpr)

# Pred is indexed as [sample, class, 1]. For each sample take the name of the
# highest-scoring class, using every class column rather than a hard-coded 1:5
# (class order here: blood, breast, CNS, lung, skin).
classification <- vapply(
  seq_len(dim(Pred)[1]),
  function(sampleIdx) names(which.max(Pred[sampleIdx, , 1])),
  character(1)
)
Show the numbers of samples that are correctly and incorrectly classified for each cancer type.
# Breast: TCGA code BRCA.
predictedForBreast <- classification[cancerTypesVec == "BRCA"]
breastCorrect <- sum(predictedForBreast == "breast")
breastIncorrect <- sum(predictedForBreast != "breast")
print(breastCorrect)
## [1] 838
print(breastIncorrect)
## [1] 374
# Blood: TCGA code LAML.
predictedForBlood <- classification[cancerTypesVec == "LAML"]
bloodCorrect <- sum(predictedForBlood == "blood")
bloodIncorrect <- sum(predictedForBlood != "blood")
print(bloodCorrect)
## [1] 173
print(bloodIncorrect)
## [1] 0
# Lung: TCGA codes LUAD and LUSC.
predictedForLung <- classification[cancerTypesVec %in% c("LUAD", "LUSC")]
lungCorrect <- sum(predictedForLung == "lung")
lungIncorrect <- sum(predictedForLung != "lung")
print(lungCorrect)
## [1] 962
print(lungIncorrect)
## [1] 166
# Skin: TCGA codes SKCM and UVM.
predictedForSkin <- classification[cancerTypesVec %in% c("SKCM", "UVM")]
skinCorrect <- sum(predictedForSkin == "skin")
skinIncorrect <- sum(predictedForSkin != "skin")
print(skinCorrect)
## [1] 475
print(skinIncorrect)
## [1] 77
# CNS: TCGA codes GBMLGG, GBM and LGG.
predictedForCns <- classification[cancerTypesVec %in% c("GBMLGG", "GBM", "LGG")]
cnsCorrect <- sum(predictedForCns == "CNS")
cnsIncorrect <- sum(predictedForCns != "CNS")
print(cnsCorrect)
## [1] 688
print(cnsIncorrect)
## [1] 8
Total number of samples correctly classified
# Correct classifications summed over the five tissues (with batch correction).
totalCorrect_withBatchCorr <- breastCorrect + bloodCorrect + lungCorrect + skinCorrect + cnsCorrect
print(totalCorrect_withBatchCorr)
## [1] 3136
Total number of samples incorrectly classified
totalIncorrect_withBatchCorr <- sum(c(breastIncorrect, bloodIncorrect, lungIncorrect, skinIncorrect, cnsIncorrect))
print(totalIncorrect_withBatchCorr)
## [1] 625

# Per-tissue counts in plotting order: breast, blood, lung, skin, CNS.
correctCounts <- c(breastCorrect, bloodCorrect, lungCorrect, skinCorrect, cnsCorrect)
incorrectCounts <- c(breastIncorrect, bloodIncorrect, lungIncorrect, skinIncorrect, cnsIncorrect)
sampleCounts <- correctCounts + incorrectCounts

# Proportion of samples correctly classified per tissue.
dat <- correctCounts / sampleCounts
labels <- c("Breast", "Blood", "Lung", "Skin", "CNS")
cols <- c("#7fc97f", "#beaed4", "#fdc086", "#ffff99", "#386cb0")

# Build the "(n=...)" axis labels from the actual counts so they cannot drift
# out of sync with the data.
barLabels <- paste0(labels, " (n=", sampleCounts, ")")

svg(paste0(theRootDir, "figures/barplot_withBatchCorrection.svg"), width = 5, height = 5.5)
par(mar = c(8.1, 4.1, 3.1, 2.1)) # large bottom margin for the rotated bar labels
barplot(dat, ylab = "Proportion Correctly Classified", col = cols, names.arg = barLabels, las = 2, cex.axis = .7)
dev.off()
## png
## 2
Permute the labels on the training data. We find that everything will be classified as “lung” because there is no signal and lung is the most common cancer type in the training set.
# Negative control: same training expression data, but with the tissue labels
# randomly shuffled so that they carry no signal.
# NOTE: no RNG seed is set, so the shuffle (and cv.glmnet's random fold
# assignment) differs between runs.
trainExpr <- t(homUncorrData$train)
trainPtyle <- as.factor(sample(typeVec))
trainDat <- data.frame(trainPtyle, trainExpr)

# alpha = 0 is the ridge penalty.
Fit <- cv.glmnet(trainExpr, trainPtyle, family = "multinomial", alpha = 0)

# Transpose the test matrix once and reuse it.
testExpr <- t(homUncorrData$test)
preDataLRR <- data.frame(testExpr)
Pred <- predict(Fit, testExpr)

# Pred is indexed as [sample, class, 1]. For each sample take the name of the
# highest-scoring class, using every class column rather than a hard-coded 1:5.
classification <- vapply(
  seq_len(dim(Pred)[1]),
  function(sampleIdx) names(which.max(Pred[sampleIdx, , 1])),
  character(1)
)
Show the numbers of samples that are correctly and incorrectly classified. Everything has been classified as lung.
# Breast: TCGA code BRCA.
predictedForBreast <- classification[cancerTypesVec == "BRCA"]
print(sum(predictedForBreast == "breast"))
## [1] 0
print(sum(predictedForBreast != "breast"))
## [1] 1212
# Blood: TCGA code LAML.
predictedForBlood <- classification[cancerTypesVec == "LAML"]
print(sum(predictedForBlood == "blood"))
## [1] 0
print(sum(predictedForBlood != "blood"))
## [1] 173
# Lung: TCGA codes LUAD and LUSC.
predictedForLung <- classification[cancerTypesVec %in% c("LUAD", "LUSC")]
print(sum(predictedForLung == "lung"))
## [1] 1128
print(sum(predictedForLung != "lung"))
## [1] 0
# Skin: TCGA codes SKCM and UVM.
predictedForSkin <- classification[cancerTypesVec %in% c("SKCM", "UVM")]
print(sum(predictedForSkin == "skin"))
## [1] 0
print(sum(predictedForSkin != "skin"))
## [1] 552
# CNS: TCGA codes GBMLGG, GBM and LGG.
predictedForCns <- classification[cancerTypesVec %in% c("GBMLGG", "GBM", "LGG")]
print(sum(predictedForCns == "CNS"))
## [1] 0
print(sum(predictedForCns != "CNS"))
## [1] 696
# Number of test samples (across all cancer types) predicted as lung.
predictedAsLung <- classification == "lung"
print(sum(predictedAsLung))
## [1] 9968
# Class sizes in the training labels.
print(table(typeVec))
## typeVec
## blood breast CNS lung skin
## 118 42 82 135 42
Let's establish significance by shuffling the labels on the TCGA breast, blood, lung, skin and CNS data. The observed number of correct classifications is highly significant.
# Print the TCGA cancer-type codes together with the CGP tissue names attached to them.
# NOTE(review): in the output below every code appears twice and the names beyond the 36th
# element are <NA>, i.e. the names are not aligned with the codes.
cancerTypeNames
## none bladder breast
## "ACC" "ACC" "BLCA"
## uterus none GI tract
## "BLCA" "BRCA" "BRCA"
## GI tract blood upper aerodigestive
## "CESC" "CESC" "CHOL"
## CNS CNS upper aerodigestive
## "CHOL" "COAD" "COAD"
## kidney kidney kidney
## "COADREAD" "COADREAD" "DLBC"
## kidney blood CNS
## "DLBC" "ESCA" "ESCA"
## none lung lung
## "GBMLGG" "GBMLGG" "GBM"
## none ovary pancreas
## "GBM" "HNSC" "HNSC"
## none other none
## "KICH" "KICH" "KIPAN"
## none skin GI tract
## "KIPAN" "KIRC" "KIRC"
## none thyroid thyroid
## "KIRP" "KIRP" "LAML"
## uterus uterus skin
## "LAML" "LGG" "LGG"
## <NA> <NA> <NA>
## "LIHC" "LIHC" "LUAD"
## <NA> <NA> <NA>
## "LUAD" "LUSC" "LUSC"
## <NA> <NA> <NA>
## "MESO" "MESO" "OV"
## <NA> <NA> <NA>
## "OV" "PAAD" "PAAD"
## <NA> <NA> <NA>
## "PCPG" "PCPG" "PRAD"
## <NA> <NA> <NA>
## "PRAD" "READ" "READ"
## <NA> <NA> <NA>
## "SARC" "SARC" "SKCM"
## <NA> <NA> <NA>
## "SKCM" "STES" "STES"
## <NA> <NA> <NA>
## "TGCT" "TGCT" "THCA"
## <NA> <NA> <NA>
## "THCA" "THYM" "THYM"
## <NA> <NA> <NA>
## "UCEC" "UCEC" "UCS"
## <NA> <NA> <NA>
## "UCS" "UVM" "UVM"
# TCGA cancer-type codes that make up the five evaluated tissues.
evaluatedCodes <- c("BRCA", "LAML", "LUAD", "LUSC", "SKCM", "UVM", "GBMLGG", "GBM", "LGG")

# Per-sample TCGA codes of the evaluated samples. Select on cancerTypesVec
# itself: a mask built from names(cancerTypeNames) has one element per
# directory entry, not per sample, and would be silently recycled.
ctypes <- cancerTypesVec[cancerTypesVec %in% evaluatedCodes]

# True tissue label of every evaluated sample, grouped by tissue.
theVec <- c(
  rep("breast", sum(cancerTypesVec == "BRCA")),
  rep("blood", sum(cancerTypesVec == "LAML")),
  rep("lung", sum(cancerTypesVec %in% c("LUAD", "LUSC"))),
  rep("skin", sum(cancerTypesVec %in% c("SKCM", "UVM"))),
  rep("CNS", sum(cancerTypesVec %in% c("GBMLGG", "GBM", "LGG")))
)

# Null distribution: number of agreements between the true labels and a random
# shuffle of them. vapply allocates the result once instead of growing a
# vector over 100000 iterations.
# NOTE: no RNG seed is set, so m and theSd vary slightly between runs.
nPermutations <- 100000
sumOut <- vapply(
  seq_len(nPermutations),
  function(permIdx) sum(theVec == sample(theVec)),
  numeric(1)
)
m <- mean(sumOut)
theSd <- sd(sumOut)
print(m)
## [1] 946.8239
print(theSd)
## [1] 25.64501
# Upper-tail probability of the observed number of correct classifications
# under a normal approximation to the null distribution. The *Correct counters
# hold the batch-corrected results at this point in the script.
print(pnorm(sum(c(breastCorrect, bloodCorrect, lungCorrect, skinCorrect, cnsCorrect)), mean = m, sd = theSd, lower.tail = FALSE))
## [1] 0
Is the change in the number of correctly classified samples significantly improved by our batch correction approach (indicating the batch correction was effective in enriching for biological signal)? Yes it is.
# 2x2 table, filled column-wise: column 1 = with batch correction (correct, incorrect),
# column 2 = without batch correction (correct, incorrect).
# NOTE(review): the same TCGA samples are classified in both conditions, so the two columns
# are paired rather than independent — confirm Fisher's exact test is the intended comparison.
print(fisher.test(matrix(c(totalCorrect_withBatchCorr, totalIncorrect_withBatchCorr, totalCorrect_noBatchCorr, totalIncorrect_noBatchCorr), nrow=2)))
##
## Fisher's Exact Test for Count Data
##
## data:
## p-value = 0.001474
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.075913 1.366780
## sample estimates:
## odds ratio
## 1.212538