#' In this script we will investigate the association between ERBB2 amplification and lapatinib sensitivity in the CGP cell lines.

#' Load libraries
library("GenomicRanges")
library("pRRophetic")
library("ggplot2")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")

#' Set the root directory to the location of the data. This must be modified for your own use (based on the location of the files).
theRootDir <- "/mnt/data_scratch/finalData/"

#' Load the CNV data from CGP. This was acquired from cancerrxgene.org and was generated using Affymetrix SNP 6.0 data. The data was not mapped to genes by CGP, so we do that below. The strategy used here for mapping genes to copy number regions is identical to the one we applied to the TCGA data. For the sake of long-term reproducibility the file has been re-posted to GitHub.
# Build the local path once; it is both the download destination and the file that is read back in.
cnvCsvPath <- paste0(theRootDir, "dataIn/cell_lines_copy_number.csv")
download.file(
  "https://raw.github.com/paulgeeleher/tcgaData/master/cgp_cnv_data/cell_lines_copy_number.csv",
  cnvCsvPath
)
allCnvs <- read.csv(cnvCsvPath, header=TRUE, as.is=TRUE)
# Split the table into one data frame per distinct value of its first column
# (presumably the cell line identifier -- confirm against the CSV header).
cellLines_cnv_list <- split(allCnvs, allCnvs[,1])

#' For each cell line in CGP, create a GenomicRanges object, which contains the locations and magnitudes of all copy number measurements. This data is GRCh37/hg19.
# lapply() returns a list with the same names as cellLines_cnv_list, so no separate
# names() assignment is needed. Unlike a 1:length() loop it also does nothing (rather
# than failing) when the input list is empty.
grCnvsList <- lapply(cellLines_cnv_list, function(cnvSegments)
{
  GRanges(
    seqnames=Rle(paste0("chr", cnvSegments[, "chr_37"])), # prefix "chr" onto the chr_37 values
    ranges=IRanges(cnvSegments[, "startpos_37"], cnvSegments[, "endpos_37"]),
    segMeans=cnvSegments[, "totalCN"] # the totalCN column is carried as the per-segment metadata column "segMeans"
  )
})

#' Load the gene ranges for hg19 from the TxDb.Hsapiens.UCSC.hg19.knownGene annotation, and look up a gene symbol for each gene ID.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
library(GenomicFeatures)
geneRanges <- genes(txdb)
library(org.Hs.eg.db)
# Table with (at least) "gene_id" and "symbol" columns, turned into a lookup vector
# of symbols named by gene ID.
e2s <- toTable(org.Hs.egSYMBOL)
syms <- setNames(e2s[, "symbol"], e2s[, "gene_id"])
# Symbols in the same order as geneRanges; a gene ID with no entry in the lookup yields NA.
theGeneSymsOrd <- syms[as.character(geneRanges$gene_id)]

#' We will now intersect the gene ranges with the CNV data in order to establish the copy number for each gene. This is identical to the strategy we use to map genes to CNVs in TCGA.
# Preallocate the per-cell-line containers instead of growing them inside the loop.
numGenesQuantifid <- numeric(length(grCnvsList))
theCnvQuantVecList <- vector("list", length(grCnvsList))
for(i in seq_along(grCnvsList))
{
    grCnvs <- grCnvsList[[i]]

    # Count, for bookkeeping, the genes that overlap exactly one copy number segment.
    # NOTE(review): numOverlaps / inCnv are not used for the assignment below; they are
    # kept only because other code may read them.
    numOverlaps <- countOverlaps(geneRanges, grCnvs)
    numGenesQuantifid[i] <- sum(numOverlaps == 1)
    inCnv <- which(numOverlaps == 1)

    # A gene is given a copy number only if it lies entirely within a segment
    # (type="within", strand ignored); every other gene stays NA.
    theCnvQuantVec <- rep(NA, length(geneRanges))
    olaps <- findOverlaps(geneRanges, grCnvs, type="within", ignore.strand=TRUE)
    # Use the queryHits()/subjectHits() accessors rather than reading the Hits slots
    # directly: the slots were renamed (to "from"/"to"), so olaps@queryHits fails on
    # current Bioconductor releases, while the accessors work on old and new versions.
    theCnvQuantVec[queryHits(olaps)] <- grCnvs$segMeans[subjectHits(olaps)]
    names(theCnvQuantVec) <- theGeneSymsOrd
    theCnvQuantVecList[[i]] <- theCnvQuantVec
}
names(theCnvQuantVecList) <- names(grCnvsList)
# Genes in rows, cell lines in columns.
theCnvQuantVecList_mat <- do.call(cbind, theCnvQuantVecList)
erbb2CnvVec <- theCnvQuantVecList_mat["ERBB2",]

#' We will now test the gene CNVs against lapatinib sensitivity here in CGP.
#' Load the IC50 data in CGP.
data(drugAndPhenoCgp) # this data is included in the pRRophetic package. It is also available from cancerrxgene.org.
cgpCellLineIds <- drugSensitivityDataCgp[, "Cell.Line"]

# Lapatinib IC50 per cell line, named by cell line; then the same vector with missing values dropped.
lapatinib_ic50 <- setNames(as.numeric(drugSensitivityDataCgp[, "Lapatinib_IC_50"]), cgpCellLineIds)
lapatinib_ic50_noNas <- na.omit(lapatinib_ic50)

# Cancer type per cell line, named by cell line.
cancerTypes <- setNames(drugSensitivityDataCgp[, "Cancer.Type"], cgpCellLineIds)

# Cell lines that have both an ERBB2 copy number entry and a non-missing lapatinib IC50,
# in the order in which they appear in erbb2CnvVec.
hasLapatinibIc50 <- names(erbb2CnvVec) %in% names(lapatinib_ic50_noNas)
commonCellLines <- names(erbb2CnvVec)[hasLapatinibIc50]
commonCellLines_types <- cancerTypes[commonCellLines]

#' ERBB2 copy number and lapatinib response are associated in CGP; however, this association is not as strong as the one we find in TCGA (this is for ALL cell lines, not just breast cancer).
# The argument expressions of the test calls below are left exactly as they were,
# because they are echoed in the "data:" line of the printed test results.
cor.test(erbb2CnvVec[commonCellLines], lapatinib_ic50_noNas[commonCellLines])

# Group the IC50 values by ERBB2 copy number; draw one box per copy number with the
# individual cell lines jittered on top.
l <- split(x=lapatinib_ic50_noNas[commonCellLines], f=erbb2CnvVec[commonCellLines])
boxplot(l, col="lightgrey", xlab="CNV", ylab="Lapatinib IC50")
stripchart(l, vertical=TRUE, pch=20, method="jitter", add=TRUE, col="#00000044")

#' If we compare "amplified" (copy number > 2) against "not amplified" (copy number == 2), it is borderline significant in the CGP data (across all cell lines).
print(
  wilcox.test(
    lapatinib_ic50_noNas[commonCellLines][erbb2CnvVec[commonCellLines] > 2],
    lapatinib_ic50_noNas[commonCellLines][erbb2CnvVec[commonCellLines] == 2]
  )
)

#' If we look for the association between ERBB2 amplification and lapatinib in just the breast cancer samples we do a little better.
isBreastLine <- commonCellLines_types == "breast"
cnvsBreast <- erbb2CnvVec[commonCellLines][isBreastLine]
ic50sBreast <- lapatinib_ic50_noNas[commonCellLines][isBreastLine]
print(
  wilcox.test(ic50sBreast[cnvsBreast == 2], ic50sBreast[cnvsBreast > 2])
)

#' Test the association between copy number amplification of each gene and lapatinib sensitivity in the breast cancer cell lines.
# The IC50 values do not depend on the gene, so they are selected once, outside the loop.
ic50sBreast <- lapatinib_ic50_noNas[commonCellLines][commonCellLines_types == "breast"]

# Preallocate one NA slot per gene. Growing the vectors by index left them shorter
# than the matrix whenever the final rows failed the filter below, and the names()
# assignment after the loop then failed on the length mismatch.
numGenesTested <- nrow(theCnvQuantVecList_mat)
pVals <- rep(NA_real_, numGenesTested)
effectSize <- rep(NA_real_, numGenesTested)
for(i in seq_len(numGenesTested))
{
  cnvsBreast <- theCnvQuantVecList_mat[i,][commonCellLines][commonCellLines_types == "breast"]
  isTwoCopies <- cnvsBreast == 2
  isAmplified <- cnvsBreast > 2

  # Only test genes with more than two breast cell lines in each group; genes that
  # fail this filter keep NA for both the p-value and the effect size.
  if((sum(isTwoCopies, na.rm=TRUE) > 2) && (sum(isAmplified, na.rm=TRUE) > 2))
  {
    pVals[i] <- t.test(ic50sBreast[isTwoCopies], ic50sBreast[isAmplified])$p.value
    # Effect size: mean IC50 of the amplified lines minus mean IC50 of the two-copy lines.
    effectSize[i] <- mean(ic50sBreast[isAmplified], na.rm=TRUE) - mean(ic50sBreast[isTwoCopies], na.rm=TRUE)
  }
}
names(pVals) <- rownames(theCnvQuantVecList_mat)
# Column 1 = p-value, column 2 = effect size; rows are genes.
resMat <- cbind(pVals, effectSize)
# Label the rows by gene. (names<- on a matrix would attach a per-element "names"
# attribute padded with NA rather than row names.)
rownames(resMat) <- rownames(theCnvQuantVecList_mat)

#' Make the plot for the cell line data showing the association between lapatinib sensitivity and copy number in the HER2 amplicon (i.e. the ERBB2 locus). This is Figure 3(d).
#' The locations of the genes in the HER2 locus, i.e. the same genes we plotted on Figure 3(b). File generated in breast_cancer_cnv_analysis.R.
load(paste0(theRootDir, "dataOut/resLocs_2.RData"))

#' Extract the data that we will use in the plot.
her2LocusGenes <- resLocs_2$gene_sym

# Gene midpoints, computed from the absolute values of the stored start/end coordinates.
startVec <- abs(resLocs_2$start)
endVec <- abs(resLocs_2$end)
midVec <- (startVec + endVec) / 2

# Per-gene results for the locus genes, looked up by row name in resMat.
pVec <- resMat[her2LocusGenes, 1] # P-value.
yVec <- resMat[her2LocusGenes, 2] # Effect size.
textVec <- names(yVec) # Labels for the points.

#' Overlay p-values on the plot above with ggplot2. This is Figure 3(d).
# One row per gene: x = midpoint in megabases, y = effect size, Drug = text label, pVal = -log10(p-value).
dat <- data.frame(x=as.numeric(midVec/1000000), y=as.numeric(yVec), Drug=textVec, pVal=-log10(pVec))
erbb2LocusPlot <- ggplot(data=dat, aes(x=x, y=y)) +
  theme_bw() +
  geom_point(aes(color=pVal), size=I(3)) +
  geom_text(aes(label=Drug), vjust=-.5, hjust=-.24, size=2.5, angle=15) +
  ylab("Lapatinib Effect Size for HER2+ vs HER2-") +
  xlab("Chromosome 17 Location (megabases)") +
  scale_color_continuous(low="steelblue4",high="tomato2", name="-Log10 P-value") +
  theme(legend.position=c(.9,.2))
svg(paste0(theRootDir, "figures/Erbb2Loc_points_GDSC.svg"), width=7, height=5)
# A ggplot object is only drawn when it is printed. Top-level auto-printing does not
# happen when the script is run through source(), which would leave an empty SVG, so
# print explicitly.
print(erbb2LocusPlot)
dev.off()


#' This is Figure 3(c), the scatterplot of lapatinib response against ERBB2 copy number in the breast cancer cell lines in GDSC.
isBreastLine <- commonCellLines_types == "breast"
erbb2AllLines <- theCnvQuantVecList_mat["ERBB2",]
cnvsBreast <- erbb2AllLines[commonCellLines][isBreastLine]
ic50sBreast <- lapatinib_ic50_noNas[commonCellLines][isBreastLine]

svg(paste0(theRootDir, "figures/her2InGDSC.svg"), width=3, height=3)
plot(
  x=cnvsBreast,
  y=ic50sBreast,
  pch=19,
  col="#00000066", # semi-transparent black so that overlapping points remain visible
  xlab="HER2 Copy Number in GDSC",
  ylab="Measured Lapatinib IC50",
  las=1,
  cex.axis=.8
)
dev.off()








