In this script we will investigate the association between ERLIN2 amplification and vinorelbine sensitivity in the CGP cell lines. First, load the required libraries.

library("GenomicRanges")
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:stats':
## 
##     xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library("pRRophetic")
## Warning: replacing previous import by 'genefilter::Anova' when loading
## 'pRRophetic'
library("ggplot2")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

Set the root directory to the location of the data. This must be modified for your own use (based on the location of the files).

# Root directory for all input and output files. Keep the trailing slash: the
# file paths used in this script are built by pasting a relative path directly
# onto this string with no separator.
theRootDir <- "/mnt/data_scratch/finalData/"

Load the CNV data from CGP. This was acquired from cancerrxgene.org and was generated using Affymetrix SNP 6.0 data. These data were not mapped to genes by CGP, so we do that below. The strategy used here for mapping genes to copy number regions is identical to the one we applied to the TCGA data. For the sake of long-term reproducibility I have re-posted this file to GitHub.

# Read the CGP segment-level copy-number table, then group its rows by the
# value in the first column (presumably the cell-line identifier -- confirm
# against the CSV), giving one data frame per group.
cnvFile <- paste0(theRootDir, "dataIn/cell_lines_copy_number.csv")
allCnvs <- read.csv(cnvFile, header = TRUE, as.is = TRUE)
cellLines_cnv_list <- split(allCnvs, allCnvs[[1]])

For each cell line in CGP, create a GenomicRanges object, which contains the locations and magnitudes of all copy number measurements. This data is GRCh37/hg19.

# Build one GRanges object per cell line from its copy-number segments.
# Chromosome names get a "chr" prefix, the segment start/end come from the
# *_37 coordinate columns, and the segment's total copy number is stored in the
# `segMeans` metadata column.
# lapply() replaces the original `1:length()` loop: it does not grow the list
# element by element and returns an empty list (instead of failing with an
# out-of-bounds subscript) when there are no cell lines.
grCnvsList <- lapply(cellLines_cnv_list, function(segs) {
  GRanges(
    seqnames = Rle(paste0("chr", segs[, "chr_37"])),
    ranges = IRanges(segs[, "startpos_37"], segs[, "endpos_37"]),
    segMeans = segs[, "totalCN"]
  )
})
names(grCnvsList) <- names(cellLines_cnv_list)

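Load the gene ranges for hg19 using the TxDb.Hsapiens.UCSC.hg19.knownGene annotation package, and map the Entrez gene IDs to gene symbols using org.Hs.eg.db.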

# Genomic ranges of all genes in the hg19 knownGene annotation.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
library(GenomicFeatures)
geneRanges <- genes(txdb)
library(org.Hs.eg.db)
## Loading required package: DBI
## 
# Lookup table from gene ID to gene symbol, then the symbols re-ordered to
# line up one-to-one with `geneRanges` (NA where an ID has no symbol entry).
e2s <- toTable(org.Hs.egSYMBOL)
syms <- setNames(e2s[, "symbol"], e2s[, "gene_id"])
theGeneSymsOrd <- syms[as.character(geneRanges$gene_id)]

We will now intersect the gene ranges with the CNV data in order to establish the copy number for each gene. This is identical to the strategy we use to map genes to CNVs in TCGA.

# For every cell line, give each gene the copy number of the segment that
# fully contains it; genes not contained in any segment stay NA. The per-line
# vectors are then bound into a genes x cell-lines matrix.
numGenesQuantifid <- numeric(length(grCnvsList))
theCnvQuantVecList <- vector("list", length(grCnvsList))
for (i in seq_along(grCnvsList)) {
  grCnvs <- grCnvsList[[i]]

  # Bookkeeping only: number of genes that overlap exactly one segment.
  numOverlaps <- countOverlaps(geneRanges, grCnvs)
  numGenesQuantifid[i] <- sum(numOverlaps == 1)

  # type = "within" keeps only genes lying entirely inside a segment, so a gene
  # straddling a segment boundary is left as NA rather than assigned a value.
  theCnvQuantVec <- rep(NA, length(geneRanges))
  olaps <- findOverlaps(geneRanges, grCnvs, type = "within", ignore.strand = TRUE)

  # Use the Hits accessors rather than reading the slots directly: the
  # underlying slot names are an implementation detail and `@queryHits` /
  # `@subjectHits` no longer exist in current Bioconductor releases.
  theCnvQuantVec[queryHits(olaps)] <- grCnvs$segMeans[subjectHits(olaps)]
  names(theCnvQuantVec) <- theGeneSymsOrd
  theCnvQuantVecList[[i]] <- theCnvQuantVec
}
names(theCnvQuantVecList) <- names(grCnvsList)

# Rows are genes (named by symbol), columns are cell lines.
theCnvQuantVecList_mat <- do.call(cbind, theCnvQuantVecList)
erlin2CnvVec <- theCnvQuantVecList_mat["ERLIN2", ]

We will now test the gene CNVs against vinorelbine sensitivity in CGP. First, load the IC50 data from CGP.

# Drug response and cell-line annotation; this data is included in the
# pRRophetic package and is also available from cancerrxgene.org.
data(drugAndPhenoCgp)
cgpCellLines <- drugSensitivityDataCgp[, "Cell.Line"]

# Vinorelbine IC50 per cell line, with and without missing measurements.
vinorelbine_ic50 <- setNames(
  as.numeric(drugSensitivityDataCgp[, "Vinorelbine_IC_50"]),
  cgpCellLines
)
vinorelbine_ic50_noNas <- na.omit(vinorelbine_ic50)

# Cancer type per cell line.
cancerTypes <- setNames(drugSensitivityDataCgp[, "Cancer.Type"], cgpCellLines)

# Cell lines that have both a copy-number profile and a measured IC50, in the
# order they appear in the copy-number data, plus their cancer types.
hasIc50 <- names(erlin2CnvVec) %in% names(vinorelbine_ic50_noNas)
commonCellLines <- names(erlin2CnvVec)[hasIc50]
commonCellLines_types <- cancerTypes[commonCellLines]

We now test whether ERLIN2 copy number and vinorelbine response are associated in CGP. The association is not as strong as the one we find in TCGA (this is for ALL cell lines, not just breast cancer).

# Pearson correlation between ERLIN2 copy number and vinorelbine IC50 across
# all cell lines that have both measurements. The result is printed explicitly,
# like the Wilcoxon tests further down, so that it is also shown when the
# script is run through source(), where top-level values are not auto-printed.
print(cor.test(erlin2CnvVec[commonCellLines], vinorelbine_ic50_noNas[commonCellLines]))
## 
##  Pearson's product-moment correlation
## 
## data:  erlin2CnvVec[commonCellLines] and vinorelbine_ic50_noNas[commonCellLines]
## t = 1.1378, df = 643, p-value = 0.2556
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03248778  0.12160323
## sample estimates:
##        cor 
## 0.04482434
# Group the IC50 values by ERLIN2 copy number, draw one box per copy-number
# value, and overlay the individual cell lines as semi-transparent jittered
# points.
l <- split(
  x = vinorelbine_ic50_noNas[commonCellLines],
  f = erlin2CnvVec[commonCellLines]
)
boxplot(l, xlab = "CNV", ylab = "Vinorelbine IC50", col = "lightgrey")
stripchart(l, add = TRUE, vertical = TRUE, method = "jitter", pch = 20, col = "#00000044")

plot of chunk unnamed-chunk-8

If we compare “amplified” against “not amplified”, the difference is at best borderline in the CGP data across all cell lines (Wilcoxon p = 0.14, see below).

# Wilcoxon rank-sum test of vinorelbine IC50 between cell lines with ERLIN2
# copy number above 2 (first group) and cell lines with a copy number of
# exactly 2 (second group). Cell lines with a copy number below 2 fall in
# neither group.
print(wilcox.test(vinorelbine_ic50_noNas[commonCellLines][erlin2CnvVec[commonCellLines] > 2], vinorelbine_ic50_noNas[commonCellLines][erlin2CnvVec[commonCellLines] == 2]))
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  vinorelbine_ic50_noNas[commonCellLines][erlin2CnvVec[commonCellLines] >  and vinorelbine_ic50_noNas[commonCellLines][erlin2CnvVec[commonCellLines] ==     2] and     2]
## W = 48411, p-value = 0.1396
## alternative hypothesis: true location shift is not equal to 0

If we look for the association between ERLIN2 amplification and vinorelbine sensitivity in just the breast cancer cell lines, we do a little better.

# Restrict ERLIN2 copy number and vinorelbine IC50 to the breast cancer cell
# lines, then compare IC50 between copy number == 2 and copy number > 2.
isBreast <- commonCellLines_types == "breast"
erlin2Common <- erlin2CnvVec[commonCellLines]
ic50Common <- vinorelbine_ic50_noNas[commonCellLines]
cnvsBreast <- erlin2Common[isBreast]
ic50sBreast <- ic50Common[isBreast]
# The test arguments are kept as written: they are echoed in the "data:" line
# of the printed result.
print(wilcox.test(ic50sBreast[cnvsBreast == 2], ic50sBreast[cnvsBreast > 2]))
## 
##  Wilcoxon rank sum test
## 
## data:  ic50sBreast[cnvsBreast == 2] and ic50sBreast[cnvsBreast > 2]
## W = 116, p-value = 0.06836
## alternative hypothesis: true location shift is not equal to 0

Test the association between copy number amplification of each gene and vinorelbine sensitivity in the breast cancer cell lines.

# For every gene, compare vinorelbine IC50 in breast cancer cell lines carrying
# more than two copies of the gene against those carrying exactly two copies.
# A gene is only tested when each group has more than two cell lines; untested
# genes keep NA for both the p-value and the effect size.
#
# The result vectors are preallocated to one entry per gene. Growing them on
# assignment left them shorter than the gene list whenever the trailing genes
# were skipped, which made the names()<- call below fail.
isBreast <- commonCellLines_types == "breast"
ic50sBreast <- vinorelbine_ic50_noNas[commonCellLines][isBreast] # same for every gene
nGenes <- nrow(theCnvQuantVecList_mat)
pVals <- rep(NA_real_, nGenes)
effectSize <- rep(NA_real_, nGenes)
for (i in seq_len(nGenes)) {
  cnvsBreast <- theCnvQuantVecList_mat[i, ][commonCellLines][isBreast]

  # which() drops cell lines whose copy number is NA for this gene.
  normalIdx <- which(cnvsBreast == 2)
  ampIdx <- which(cnvsBreast > 2)

  if (length(normalIdx) > 2 && length(ampIdx) > 2) {
    normalIc50 <- ic50sBreast[normalIdx]
    ampIc50 <- ic50sBreast[ampIdx]
    pVals[i] <- t.test(normalIc50, ampIc50)$p.value
    # Effect size: mean IC50 of amplified lines minus mean IC50 of 2-copy lines.
    effectSize[i] <- mean(ampIc50, na.rm = TRUE) - mean(normalIc50, na.rm = TRUE)
  }
}
names(pVals) <- rownames(theCnvQuantVecList_mat)

# Column 1 = p-value, column 2 = effect size; rows are named by gene.
# rownames() is the correct setter for a matrix; names() would attach a
# per-element names attribute instead.
resMat <- cbind(pVals, effectSize)
rownames(resMat) <- rownames(theCnvQuantVecList_mat)

Make the plot for the cell line data showing the association between vinorelbine sensitivity and copy number in the ERLIN2 locus. This is Figure 3(d). First, load the locations of the genes in the ERLIN2 locus, i.e. the same genes we plotted in Figure 3(c). This file was generated in breast_cancer_cnv_analysis.R.

# Restore the saved objects from the ERLIN2-locus results file into the
# workspace.
load(paste0(theRootDir, "dataOut/resLocs_vBine_2.RData"))

Extract the data that we will use in the plot.

# Gene coordinates with the sign dropped (presumably negative values encode
# the strand -- confirm against the script that wrote the file) and the
# midpoint of each gene.
locusGenes <- resLocs_vBine_2$gene_sym
startVec <- abs(resLocs_vBine_2$start)
endVec <- abs(resLocs_vBine_2$end)
midVec <- (startVec + endVec) / 2

# Per-gene cell-line results for the locus genes, looked up by gene symbol.
yVec <- resMat[locusGenes, 2] # Effect size.
pVec <- resMat[locusGenes, 1] # P-value.
textVec <- names(yVec) # Labels for the plot.

Plot the effect sizes along the locus with ggplot2, colouring each point by its p-value.

# One row per gene in the locus: position in megabases, effect size, label and
# -log10 p-value. Note that the `Drug` column holds the gene labels.
dat <- data.frame(
  x = as.numeric(midVec / 1e6),
  y = as.numeric(yVec),
  Drug = textVec,
  pVal = -log10(pVec)
)

svg(paste0(theRootDir, "figures/Erlin2Loc_points_GDSC.svg"), width = 7, height = 5)
locusPlot <- ggplot(data = dat, aes(x = x, y = y)) +
  theme_bw() +
  geom_point(aes(color = pVal), size = 3) +
  geom_text(aes(label = Drug), vjust = -.5, hjust = -.24, size = 2.5, angle = 15) +
  ylab("Vinorelbine Effect Size for ERLIN2 amp vs ERLIN2 wt") +
  xlab("Chromosome 8 Location (megabases)") +
  scale_color_continuous(low = "steelblue4", high = "tomato2", name = "-Log10 P-value") +
  theme(legend.position = c(.9, .2))
# A ggplot object is only drawn when it is printed. Printing it explicitly
# makes sure the figure is written to the SVG device even when the script is
# run through source(), where top-level values are not auto-printed.
print(locusPlot)
dev.off()
## png 
##   2

This is Figure 3(b): ERLIN2 copy number against measured vinorelbine IC50 in the breast cancer cell lines.

# Scatter plot of ERLIN2 copy number against measured vinorelbine IC50 for the
# breast cancer cell lines, written to an SVG file.
isBreast <- commonCellLines_types == "breast"
erlin2Row <- theCnvQuantVecList_mat["ERLIN2", ]
cnvsBreast <- erlin2Row[commonCellLines][isBreast]
ic50sBreast <- vinorelbine_ic50_noNas[commonCellLines][isBreast]

svg(paste0(theRootDir, "figures/erlin2InGDSC.svg"), width = 3, height = 3)
plot(
  cnvsBreast, ic50sBreast,
  xlab = "ERLIN2 Copy Number in GDSC",
  ylab = "Measured Vinorelbine IC50",
  pch = 19, col = "#00000066", las = 1, cex.axis = .8
)
dev.off()
## png 
##   2