#' This script will do 10-fold cross-validation (see the cvFold argument passed to the cross-validation call below) for every drug in CGP and assess how well its sensitivity can be predicted from gene expression data alone in CGP.

#' Set the root directory to the location of the data. This must be modified for your own use (based on the location of the files).
theRootDir <- "/mnt/data_scratch/finalData/"

#' Create the directory that the output table is written to. We check for it straight away and stop if it is missing, so that a wrong path is noticed now rather than when the results are written at the very end of the analysis.
tablesDir <- paste0(theRootDir, "tables")
dir.create(tablesDir, showWarnings = FALSE)
if (!dir.exists(tablesDir)) {
  stop("Could not create the output directory: ", tablesDir, call. = FALSE)
}

#' Number of cores to use in this analysis. NB: Make sure to tailor this to your machine.
ncores <- 8

#' Load libraries
library("pRRophetic")
library("parallel")

#' The drugs to analyse, named as they are passed to getCGPinfo() and pRRopheticCV() below. For each of them we next get the number of samples for which we have data. We need this as this analysis will not be meaningful with very small numbers of samples (although these drugs are rare).
possibleDrugs <- c("A.443654", "A.770041", "ABT.263", "ABT.888", "AG.014699", "AICAR", "AKT.inhibitor.VIII", "AMG.706", "AP.24534", "AS601245", "ATRA", "AUY922", "Axitinib", "AZ628", "AZD.0530", "AZD.2281", "AZD6244", "AZD6482", "AZD7762", "AZD8055", "BAY.61.3606", "Bexarotene", "BI.2536", "BIBW2992", "Bicalutamide", "BI.D1870", "BIRB.0796", "Bleomycin", "BMS.509744", "BMS.536924", "BMS.708163", "BMS.754807", "Bortezomib", "Bosutinib", "Bryostatin.1", "BX.795", "Camptothecin", "CCT007093", "CCT018159", "CEP.701", "CGP.082996", "CGP.60474", "CHIR.99021", "CI.1040", "Cisplatin", "CMK", "Cyclopamine", "Cytarabine", "Dasatinib", "DMOG", "Docetaxel", "Doxorubicin", "EHT.1864", "Elesclomol", "Embelin", "Epothilone.B", "Erlotinib", "Etoposide", "FH535", "FTI.277", "GDC.0449", "GDC0941", "Gefitinib", "Gemcitabine", "GNF.2", "GSK269962A", "GSK.650394", "GW.441756", "GW843682X", "Imatinib", "IPA.3", "JNJ.26854165", "JNK.9L", "JNK.Inhibitor.VIII", "JW.7.52.1", "KIN001.135", "KU.55933", "Lapatinib", "Lenalidomide", "LFM.A13", "Metformin", "Methotrexate", "MG.132", "Midostaurin", "Mitomycin.C", "MK.2206", "MS.275", "Nilotinib", "NSC.87877", "NU.7441", "Nutlin.3a", "NVP.BEZ235", "NVP.TAE684", "Obatoclax.Mesylate", "OSI.906", "PAC.1", "Paclitaxel", "Parthenolide", "Pazopanib", "PD.0325901", "PD.0332991", "PD.173074", "PF.02341066", "PF.4708671", "PF.562271", "PHA.665752", "PLX4720", "Pyrimethamine", "QS11", "Rapamycin", "RDEA119", "RO.3306", "Roscovitine", "Salubrinal", "SB.216763", "SB590885", "Shikonin", "SL.0101.1", "Sorafenib", "S.Trityl.L.cysteine", "Sunitinib", "Temsirolimus", "Thapsigargin", "Tipifarnib", "TW.37", "Vinblastine", "Vinorelbine", "Vorinostat", "VX.680", "VX.702", "WH.4.023", "WO2009093972", "WZ.1.84", "X17.AAG", "X681640", "XMD8.85", "Z.LLNle.CHO", "ZM.447439")
#' For every drug in possibleDrugs, record how many samples have data when all tissue types are used (theLens) and when only blood is used (theLensBlood), and keep the names of the blood samples (cgpBloodSampsList) so that results can later be split into blood and non-blood samples.
nDrugs <- length(possibleDrugs)
theLens <- numeric(nDrugs)
theLensBlood <- numeric(nDrugs)
cgpBloodSampsList <- vector("list", nDrugs)
for (i in seq_along(possibleDrugs)) {
  cgpTrainData <- getCGPinfo(possibleDrugs[i], "all") # get the IC50 and expression data for this drug, all tissue types
  theLens[i] <- length(cgpTrainData$ic50sOrd)

  cgpTrainDataBlood <- getCGPinfo(possibleDrugs[i], "blood") # get the IC50 and expression data for this drug, blood only
  theLensBlood[i] <- length(cgpTrainDataBlood$ic50sOrd)

  # Assign through `[i] <- list(...)`: a NULL value then stays in position i
  # instead of deleting the element and shifting the later drugs.
  cgpBloodSampsList[i] <- list(names(cgpTrainDataBlood[[1]]))
}
names(theLens) <- possibleDrugs
names(theLensBlood) <- possibleDrugs
names(cgpBloodSampsList) <- possibleDrugs


#' Run the cross-validation for a single drug. `num` is the position of the drug in possibleDrugs; tissueType, cvFold and minNumSamples are handed to pRRopheticCV() unchanged. Drugs with 40 or fewer samples (according to theLens) are skipped and give NULL.
doCv <- function(num, tissueType, cvFold, minNumSamples) {
  # print(num)
  enoughSamples <- theLens[num] > 40
  if (!enoughSamples) {
    return(NULL)
  }

  pRRopheticCV(
    possibleDrugs[num],
    tissueType = tissueType,
    cvFold = cvFold,
    minNumSamples = minNumSamples
  )
}

#' Do the cross-validation analysis for every drug, training on samples from all tissue types (tissueType="all") with cvFold=10. The drugs are processed in parallel on ncores cores. The result has one entry per drug, in the order of possibleDrugs; drugs that doCv() skips are NULL.
loocvOut_all <- mclapply(seq_along(possibleDrugs), doCv, tissueType="all", cvFold=10, minNumSamples=0, mc.cores=ncores)
names(loocvOut_all) <- possibleDrugs

#' Summarise the cross-validation results for all drugs: the correlation between the two vectors returned by the cross-validation, computed over all samples, over the non-blood ("solid") samples only and over the blood samples only.

# Sample size, Spearman correlation and p-value for one pair of vectors.
# The two statistics are NA when there are fewer than three samples, because
# no meaningful correlation test is possible then. The method is given
# explicitly because cor.test() defaults to Pearson, whereas these results are
# stored and reported as Spearman correlations.
summariseCvCor <- function(x, y) {
  n <- length(x)
  if (n < 3) {
    return(c(n = n, rho = NA_real_, p = NA_real_))
  }
  theCor <- cor.test(x, y, method = "spearman")
  c(n = n, rho = unname(theCor$estimate), p = theCor$p.value)
}

# Start every result vector with one NA per drug, named by drug. All vectors
# then have the same length and order, also for drugs that are skipped below.
naPerDrug <- rep(NA_real_, length(possibleDrugs))
names(naPerDrug) <- possibleDrugs
corP <- naPerDrug
corSpear <- naPerDrug
corP_solid <- naPerDrug
corSpear_solid <- naPerDrug
corP_blood <- naPerDrug
corSpear_blood <- naPerDrug
nAll <- naPerDrug
nBlood <- naPerDrug
nSolid <- naPerDrug

for (i in seq_along(loocvOut_all)) {
  cvRes <- loocvOut_all[[i]]

  # Nothing to summarise for drugs that doCv() skipped (NULL) or whose job
  # failed inside mclapply() (which returns a "try-error" object for them).
  if (is.null(cvRes) || inherits(cvRes, "try-error") || is.null(cvRes[[1]])) {
    next
  }

  # NOTE(review): elements 1 and 2 are presumably the cross-validated
  # predictions and the measured values - confirm against pRRopheticCV().
  # The correlation itself does not depend on which is which.
  cvFirst <- cvRes[[1]]
  cvSecond <- cvRes[[2]]

  # Split the samples of THIS drug into blood and non-blood, using the blood
  # sample names that were recorded for the same drug.
  isBlood <- names(cvSecond) %in% cgpBloodSampsList[[i]]

  # correlations on all tumors.
  statsAll <- summariseCvCor(cvFirst, cvSecond)
  nAll[i] <- statsAll[["n"]]
  corSpear[i] <- statsAll[["rho"]]
  corP[i] <- statsAll[["p"]]

  # correlations on solid tumors.
  statsSolid <- summariseCvCor(cvFirst[!isBlood], cvSecond[!isBlood])
  nSolid[i] <- statsSolid[["n"]]
  corSpear_solid[i] <- statsSolid[["rho"]]
  corP_solid[i] <- statsSolid[["p"]]

  # correlations on blood tumors.
  statsBlood <- summariseCvCor(cvFirst[isBlood], cvSecond[isBlood])
  nBlood[i] <- statsBlood[["n"]]
  corSpear_blood[i] <- statsBlood[["rho"]]
  corP_blood[i] <- statsBlood[["p"]]
}

#' Create the table (one row per drug, one column per statistic) and write it to the tables directory below theRootDir. Included as supplementary table 5.
m <- cbind(nAll, corSpear, corP, nSolid, corSpear_solid, corP_solid, nBlood, corSpear_blood, corP_blood)
colnames(m) <- c("N All", "Spearman Correlation All", "Spearman P-value All", "N Solid Tumors", "Spearman Correlation Solid", "Spearman P-value Solid", "N Blood", "Spearman Correlation Blood", "Spearman P-value Blood")

# The output path is passed to write.csv() through its `file` argument. The
# file name is kept exactly as it was, as other code may refer to it.
write.csv(m, file = paste0(theRootDir, "tables/SuppTabPredictabiltiy.csv"))




