#' Here, I want to try to classify the type of cancer in TCGA using models derived from cell lines. This file also contains code for creating Figure 1(a).

#' Root directory under which the input data live and the figures are written.
#' This must be modified for your own use. Keep the trailing slash: every path
#' below is built by plain string concatenation onto this value.
theRootDir <- "/mnt/data_scratch/finalData/"

#' Load the pRRophetic package.
library(pRRophetic)

#' Load the CGP expression and cancer type data, then list the distinct cancer
#' types annotated for the CGP cell lines.
data("drugAndPhenoCgp")
unique(drugSensitivityDataCgp[, "Cancer.Type"])


#' Manually map the TCGA cancers to the CGP cancer types, so that the ones
#' which map unambiguously can be identified. Entries that do not map are
#' labelled "none" or "other".
tcgaRnaSeqDir <- paste0(theRootDir, "dataIn/rnaSeq") # directory containing all TCGA gene expression data.
theDirs <- dir(tcgaRnaSeqDir)

#' The TCGA cancer code is the second "_"-separated token of the third
#' "."-separated field of each directory entry name. vapply() guarantees a
#' character vector even when the listing is empty.
cancerTypeNames <- vapply(
  strsplit(theDirs, ".", fixed = TRUE),
  function(fields) strsplit(fields[[3]], "_", fixed = TRUE)[[1]][2],
  character(1)
)

#' CGP cancer type for each directory entry, in the same order as dir()
#' returned them. The mapping is purely positional.
cancerTypeNames_match_cgp_type <- c(
  "none", "bladder", "breast", "uterus", "none", "GI tract", "GI tract", "blood",
  "upper aerodigestive", "CNS", "CNS", "upper aerodigestive", "kidney", "kidney", "kidney",
  "kidney", "blood", "CNS", "none", "lung", "lung", "none", "ovary", "pancreas", "none",
  "other", "none", "none", "skin", "GI tract", "none", "thyroid", "thyroid", "uterus", "uterus", "skin"
)

#' Because the mapping is positional, a directory listing of a different length
#' would attach the wrong CGP type to a cancer code (or pad with NA), so fail
#' loudly instead.
if (length(cancerTypeNames) != length(cancerTypeNames_match_cgp_type)) {
  stop(
    "Found ", length(cancerTypeNames), " entries in '", tcgaRnaSeqDir,
    "' but the manual CGP mapping has ", length(cancerTypeNames_match_cgp_type),
    " entries; update cancerTypeNames_match_cgp_type.",
    call. = FALSE
  )
}
names(cancerTypeNames) <- cancerTypeNames_match_cgp_type

#' Look-up from CGP cell line name to its annotated cancer type.
cellLineToCancerType <- setNames(
  drugSensitivityDataCgp[, "Cancer.Type"],
  drugSensitivityDataCgp[, "Cell.Line"]
)

#' These tissues have a large number of samples in both CGP and TCGA, and are
#' unambiguously annotated.
tissuesUsing <- c("breast", "skin", "CNS", "blood", "lung")
isTissueUsed <- drugSensitivityDataCgp[, "Cancer.Type"] %in% tissuesUsing
relevantCellLines <- drugSensitivityDataCgp[isTissueUsed, c("Cell.Line", "Cancer.Type")]

#' For every tissue, collect the array data files of its cell lines.
celFilesList <- lapply(tissuesUsing, function(tissue) {
  cellLinesOfTissue <- relevantCellLines[relevantCellLines[, 2] %in% tissue, 1]
  isFromTissue <- drugToCellLineDataCgp[, "Source.Name"] %in% cellLinesOfTissue
  drugToCellLineDataCgp[isFromTissue, "Array.Data.File"]
})
names(celFilesList) <- tissuesUsing

#' One tissue label per selected array, in the same order as the arrays are
#' concatenated below.
typeVec <- rep(tissuesUsing, times = vapply(celFilesList, length, integer(1)))

#' Expression matrix restricted to the selected arrays, plus a data frame with
#' the tissue label alongside the per-sample expression values.
selectedArrays <- do.call(c, celFilesList)
geneExprMat <- gdsc_brainarray_syms[, selectedArrays]
df <- data.frame(typeVec, t(geneExprMat))

#' Load the TCGA data: the raw TPM created in batch_correct_tcga_data.R and the
#' RUV batch corrected data. The trailing comments name the objects each file
#' is expected to provide.
tcgaDataInDir <- paste0(theRootDir, "dataIn/")
load(file = paste0(tcgaDataInDir, "allExprData.RData")) # allExprData
# load(file=paste(theRootDir, "dataIn/cancerTypesVec.RData", sep="")) # cancerTypesVec
load(file = paste0(tcgaDataInDir, "tenRuvNewStandardApproach.RData")) # tenRuvNewStandardApproach, cancerTypesVec


#' Homogenize the data, then fit a logistic model with glmnet. Homogenizing
#' combines the datasets by standardizing the mean and variance for each
#' gene to 0 and 1 respectively and subsetting to only genes that are
#' represented in each dataset. This pass uses the TCGA data that has NOT been
#' batch corrected.
homUncorrData <- homogenizeData(allExprData, geneExprMat, batchCorrect = "standardize", selection = 1, printOutput = TRUE)

#' Fit a glmnet multinomial logistic model on the cell lines.
library("glmnet")
trainExpr <- t(homUncorrData$train)
trainPtyle <- as.factor(typeVec)
Fit <- cv.glmnet(trainExpr, trainPtyle, family = "multinomial", alpha = 0) # alpha = 0 is the ridge penalty.

#' Score every TCGA sample. Pred is indexed as [sample, class, 1]; the predicted
#' tissue of a sample is the class with the highest score. Class names are read
#' from the second dimension of Pred, so nothing depends on the number of
#' classes. which.max() keeps the first class on ties.
Pred <- predict(Fit, t(homUncorrData$test))
classScores <- Pred[, , 1, drop = FALSE]
classification <- unname(dimnames(Pred)[[2]][apply(classScores, 1, which.max)])

#' Show the numbers of samples that are correctly and incorrectly classified
#' for each cancer type. %in% is used for every tissue so that a missing cohort
#' label can never turn a count into NA.
isBreast <- cancerTypesVec %in% "BRCA"
breastCorrect <- sum(classification[isBreast] == "breast")
breastIncorrect <- sum(classification[isBreast] != "breast")
print(breastCorrect)
print(breastIncorrect)

isBlood <- cancerTypesVec %in% "LAML"
bloodCorrect <- sum(classification[isBlood] == "blood")
bloodIncorrect <- sum(classification[isBlood] != "blood")
print(bloodCorrect)
print(bloodIncorrect)

isLung <- cancerTypesVec %in% c("LUAD", "LUSC")
lungCorrect <- sum(classification[isLung] == "lung")
lungIncorrect <- sum(classification[isLung] != "lung")
print(lungCorrect)
print(lungIncorrect)

isSkin <- cancerTypesVec %in% c("SKCM", "UVM")
skinCorrect <- sum(classification[isSkin] == "skin")
skinIncorrect <- sum(classification[isSkin] != "skin")
print(skinCorrect)
print(skinIncorrect)

isCns <- cancerTypesVec %in% c("GBMLGG", "GBM", "LGG")
cnsCorrect <- sum(classification[isCns] == "CNS")
cnsIncorrect <- sum(classification[isCns] != "CNS")
print(cnsCorrect)
print(cnsIncorrect)

#' Total number of samples correctly classified
totalCorrect_noBatchCorr <- sum(c(breastCorrect, bloodCorrect, lungCorrect, skinCorrect, cnsCorrect))
print(totalCorrect_noBatchCorr)

#' Total number of samples incorrectly classified
totalIncorrect_noBatchCorr <- sum(c(breastIncorrect, bloodIncorrect, lungIncorrect, skinIncorrect, cnsIncorrect))
print(totalIncorrect_noBatchCorr)

#' Proportion correctly classified per tissue. The sample counts shown in the
#' bar labels are the same numbers used as denominators, so the labels cannot
#' drift out of sync with the data.
labels <- c("Breast", "Blood", "Lung", "Skin", "CNS")
nCorrectPerTissue <- c(breastCorrect, bloodCorrect, lungCorrect, skinCorrect, cnsCorrect)
nPerTissue <- nCorrectPerTissue + c(breastIncorrect, bloodIncorrect, lungIncorrect, skinIncorrect, cnsIncorrect)
dat <- nCorrectPerTissue / nPerTissue

cols <- c("#7fc97f", "#beaed4", "#fdc086", "#ffff99", "#386cb0")
svg(paste0(theRootDir, "figures/barplot_noBatchCorrection.svg"), width = 5, height = 5.5)
par(mar = c(8.1, 4.1, 3.1, 2.1)) # wide bottom margin for the rotated bar labels
barplot(dat, ylab = "Proportion Correctly Classified", col = cols, names.arg = sprintf("%s (n=%d)", labels, nPerTissue), las = 2, cex.axis = .7)
dev.off()


#' Compare the performance of this to the batch corrected data.
#' NOTE: despite its name, homUncorrData holds the homogenized *batch corrected*
#' (RUV) data from here on. The name is kept because the code further down the
#' script reads this variable.
homUncorrData <- homogenizeData(tenRuvNewStandardApproach, geneExprMat, batchCorrect = "standardize", selection = 1, printOutput = TRUE)

#' Fit a glmnet multinomial logistic model on the cell lines.
library("glmnet")
trainExpr <- t(homUncorrData$train)
trainPtyle <- as.factor(typeVec)
Fit <- cv.glmnet(trainExpr, trainPtyle, family = "multinomial", alpha = 0) # alpha = 0 is the ridge penalty.

#' Score every TCGA sample. Pred is indexed as [sample, class, 1]; the predicted
#' tissue of a sample is the class with the highest score. Class names are read
#' from the second dimension of Pred, so nothing depends on the number of
#' classes. which.max() keeps the first class on ties.
Pred <- predict(Fit, t(homUncorrData$test))
classScores <- Pred[, , 1, drop = FALSE]
classification <- unname(dimnames(Pred)[[2]][apply(classScores, 1, which.max)])

#' Show the numbers of samples that are correctly and incorrectly classified
#' for each cancer type. %in% is used for every tissue so that a missing cohort
#' label can never turn a count into NA.
isBreast <- cancerTypesVec %in% "BRCA"
breastCorrect <- sum(classification[isBreast] == "breast")
breastIncorrect <- sum(classification[isBreast] != "breast")
print(breastCorrect)
print(breastIncorrect)

isBlood <- cancerTypesVec %in% "LAML"
bloodCorrect <- sum(classification[isBlood] == "blood")
bloodIncorrect <- sum(classification[isBlood] != "blood")
print(bloodCorrect)
print(bloodIncorrect)

isLung <- cancerTypesVec %in% c("LUAD", "LUSC")
lungCorrect <- sum(classification[isLung] == "lung")
lungIncorrect <- sum(classification[isLung] != "lung")
print(lungCorrect)
print(lungIncorrect)

isSkin <- cancerTypesVec %in% c("SKCM", "UVM")
skinCorrect <- sum(classification[isSkin] == "skin")
skinIncorrect <- sum(classification[isSkin] != "skin")
print(skinCorrect)
print(skinIncorrect)

isCns <- cancerTypesVec %in% c("GBMLGG", "GBM", "LGG")
cnsCorrect <- sum(classification[isCns] == "CNS")
cnsIncorrect <- sum(classification[isCns] != "CNS")
print(cnsCorrect)
print(cnsIncorrect)

#' Total number of samples correctly classified
totalCorrect_withBatchCorr <- sum(c(breastCorrect, bloodCorrect, lungCorrect, skinCorrect, cnsCorrect))
print(totalCorrect_withBatchCorr)

#' Total number of samples incorrectly classified
totalIncorrect_withBatchCorr <- sum(c(breastIncorrect, bloodIncorrect, lungIncorrect, skinIncorrect, cnsIncorrect))
print(totalIncorrect_withBatchCorr)

#' Proportion correctly classified per tissue. The sample counts shown in the
#' bar labels are the same numbers used as denominators, so the labels cannot
#' drift out of sync with the data.
labels <- c("Breast", "Blood", "Lung", "Skin", "CNS")
nCorrectPerTissue <- c(breastCorrect, bloodCorrect, lungCorrect, skinCorrect, cnsCorrect)
nPerTissue <- nCorrectPerTissue + c(breastIncorrect, bloodIncorrect, lungIncorrect, skinIncorrect, cnsIncorrect)
dat <- nCorrectPerTissue / nPerTissue

cols <- c("#7fc97f", "#beaed4", "#fdc086", "#ffff99", "#386cb0")
svg(paste0(theRootDir, "figures/barplot_withBatchCorrection.svg"), width = 5, height = 5.5)
par(mar = c(8.1, 4.1, 3.1, 2.1)) # wide bottom margin for the rotated bar labels
barplot(dat, ylab = "Proportion Correctly Classified", col = cols, names.arg = sprintf("%s (n=%d)", labels, nPerTissue), las = 2, cex.axis = .7)
dev.off()


#' Negative control: permute the labels on the training data and refit.
#' The original analysis reports that everything is then classified as "lung",
#' because there is no signal left and lung is the most common cancer type in
#' the training set. homUncorrData is whatever the preceding section left in
#' it (there, the homogenized batch corrected data).
#' NOTE(review): sample() and cv.glmnet() are unseeded, so exact counts can
#' differ between runs.
trainExpr <- t(homUncorrData$train)
trainPtyle <- as.factor(sample(typeVec))
Fit <- cv.glmnet(trainExpr, trainPtyle, family = "multinomial", alpha = 0) # alpha = 0 is the ridge penalty.

#' Pred is indexed as [sample, class, 1]; the predicted tissue of a sample is
#' the class with the highest score (first class on ties). Class names are read
#' from the second dimension of Pred, so nothing depends on the number of
#' classes.
Pred <- predict(Fit, t(homUncorrData$test))
classScores <- Pred[, , 1, drop = FALSE]
classification <- unname(dimnames(Pred)[[2]][apply(classScores, 1, which.max)])

#' Show the numbers of samples that are correctly and then incorrectly
#' classified, tissue by tissue (breast, blood, lung, skin, CNS).
tcgaCodesByTissue <- list(
  breast = "BRCA",
  blood = "LAML",
  lung = c("LUAD", "LUSC"),
  skin = c("SKCM", "UVM"),
  CNS = c("GBMLGG", "GBM", "LGG")
)
for (tissue in names(tcgaCodesByTissue)) {
  predictedForTissue <- classification[cancerTypesVec %in% tcgaCodesByTissue[[tissue]]]
  print(sum(predictedForTissue == tissue))
  print(sum(predictedForTissue != tissue))
}

#' How many samples were called "lung", next to the class sizes of the
#' training set.
print(sum(classification == "lung"))
print(table(typeVec))


#' Let's establish significance by shuffling the labels on the TCGA breast,
#' blood, lung, skin and CNS data. The original analysis reports this as highly
#' significant.
cancerTypeNames

#' TCGA cancer codes whose CGP type is one of the five tissues. The mask is
#' built over cancerTypeNames, so it has to index cancerTypeNames; indexing the
#' per-sample cancerTypesVec with it would silently recycle the mask.
#' NOTE(review): ctypes is not used below - confirm this is the intended value.
ctypes <- cancerTypeNames[names(cancerTypeNames) %in% c("skin", "breast", "blood", "lung", "CNS")]

#' True tissue label of every TCGA sample belonging to the five tissues.
#' %in% keeps the repeat counts free of NA even if a cohort label is missing.
theVec <- c(
  rep("breast", sum(cancerTypesVec %in% "BRCA")),
  rep("blood", sum(cancerTypesVec %in% "LAML")),
  rep("lung", sum(cancerTypesVec %in% c("LUAD", "LUSC"))),
  rep("skin", sum(cancerTypesVec %in% c("SKCM", "UVM"))),
  rep("CNS", sum(cancerTypesVec %in% c("GBMLGG", "GBM", "LGG")))
)

#' Null distribution: the number of positions at which a random shuffle of the
#' labels still agrees with the true labels. vapply() fills a pre-sized vector
#' instead of growing one element at a time.
nPermutations <- 100000
sumOut <- vapply(
  seq_len(nPermutations),
  function(i) sum(theVec == sample(theVec)),
  integer(1)
)
m <- mean(sumOut)
theSd <- sd(sumOut)
print(m)
print(theSd)

#' Upper-tail probability of the observed number of correct calls under a
#' normal approximation to the null distribution. The *Correct counts are the
#' ones left in the workspace by the most recent per-tissue tally above.
observedCorrect <- sum(c(breastCorrect, bloodCorrect, lungCorrect, skinCorrect, cnsCorrect))
print(pnorm(observedCorrect, mean = m, sd = theSd, lower.tail = FALSE))


#' Does our batch correction approach significantly change the number of
#' correctly classified samples (which would indicate that it was effective in
#' enriching for biological signal)? Fisher's exact test on the 2 x 2 table
#' whose first column is (correct, incorrect) with batch correction and whose
#' second column is (correct, incorrect) without it. The original analysis
#' reports that the improvement is significant.
batchCorrFisher <- fisher.test(
  matrix(
    c(
      totalCorrect_withBatchCorr, totalIncorrect_withBatchCorr,
      totalCorrect_noBatchCorr, totalIncorrect_noBatchCorr
    ),
    nrow = 2
  )
)
print(batchCorrFisher)




