#' This script will download all of the TCGA data that we have used in this project. This data is obtained from the firebrowse.org repository.


#' To run from the Bionimbus PDC, first run the following commands from the command line to set up the HTTP proxy:
#' export http_proxy=http://cloud-proxy:3128
#' export https_proxy=http://cloud-proxy:3128

#' Set the root directory where the data will be stored. NB: this directory needs to be set based on your own system!!
theRootDir <- "/mnt/data_scratch/finalData/"

#' Create the "dataIn/" directory (and any missing parent directories) if it doesn't already exist.
# recursive = TRUE: without it dir.create() fails when theRootDir itself does not exist yet, and
# showWarnings = FALSE would hide that failure until the first download errors out.
dataInDir <- paste0(theRootDir, "dataIn/")
dir.create(dataInDir, showWarnings = FALSE, recursive = TRUE)
# Fail fast with a clear message rather than part-way through the downloads.
if (!dir.exists(dataInDir)) {
  stop("Could not create the data directory: ", dataInDir, call. = FALSE)
}


#' Firebrowse organizes its data by cancer type; each of the disease abbreviations below names one of the folders that hold the data.
diseaseAbbrvs <- c(
  "ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "COADREAD", "DLBC",
  "ESCA", "FPPP", "GBM", "GBMLGG", "HNSC", "KICH", "KIPAN", "KIRC",
  "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "MESO", "OV",
  "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "STES",
  "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM"
)


#' Download all the TCGA RNA-seq data.
# There is no RNA-seq data for "FPPP" or "STAD". They are excluded by name rather than by
# hard-coded position, so the exclusion stays correct if diseaseAbbrvs is ever edited.
isMissingRnaSeq <- diseaseAbbrvs %in% c("FPPP", "STAD")
missingAbrvsRnaSeq <- which(isMissingRnaSeq) # positions of the excluded abbreviations within diseaseAbbrvs
rnaSeqDiseaseAbbrvs <- diseaseAbbrvs[!isMissingRnaSeq]
rnaSeqFilesDir <- paste0(theRootDir, "dataIn/rnaSeq/")
dir.create(rnaSeqFilesDir, showWarnings = FALSE, recursive = TRUE) # make this directory (and any missing parents) if it doesn't exist.
for (i in seq_along(rnaSeqDiseaseAbbrvs)) {
  # The archive has the same name on the server and on disk, so it is built only once.
  fname <- paste0("gdac.broadinstitute.org_", rnaSeqDiseaseAbbrvs[i], ".Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.Level_3.2015082100.0.0.tar.gz")
  rnaSeqUrl <- paste0("http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/", rnaSeqDiseaseAbbrvs[i], "/20150821/", fname)
  # mode = "wb": the archives are binary, so request a binary transfer explicitly.
  download.file(rnaSeqUrl, paste0(rnaSeqFilesDir, fname), mode = "wb")
}

# Unzip the downloaded ".tar.gz" RNA-seq data! NB, this command has been tested in Linux. It may not work in Windows. If it does not work, please extract these files manually using software such as 7zip.
# Only the ".tar.gz" archives are selected: listing everything in the directory would also
# return the folders extracted by a previous run of this script, which untar() cannot open.
thegzFiles <- list.files(rnaSeqFilesDir, pattern = "\\.tar\\.gz$", full.names = TRUE)
for (gzFile in thegzFiles) {
  untarStatus <- untar(gzFile, exdir = rnaSeqFilesDir)
  # untar() reports success as a return code of 0; anything else is surfaced instead of ignored.
  if (!isTRUE(untarStatus == 0L)) {
    warning("Failed to extract ", gzFile, call. = FALSE)
  }
}


#' Download the TCGA clinical data.
clinicalFilesDir <- paste0(theRootDir, "dataIn/clinical/")
dir.create(clinicalFilesDir, showWarnings = FALSE) # make this directory if it doesn't exist.
#' Note: the clinical data is fetched from a GitHub mirror. We were having issues accessing these data directly right before re-submitting this manuscript, so the files were mirrored at the link below; the original direct-download loop is kept, commented out, after this one.
allTcgaClinAbrvs <- c(
  "acc", "blca", "brca", "cesc", "chol", "cntl", "coad", "dlbc",
  "esca", "fppp", "gbm", "hnsc", "kich", "kirc", "kirp", "laml",
  "lcml", "lgg", "lihc", "lnnh", "luad", "lusc", "meso", "misc",
  "ov", "paad", "pcpg", "prad", "read", "sarc", "skcm", "stad",
  "tgct", "thca", "thym", "ucec", "ucs", "uvm"
)
for (i in seq_along(allTcgaClinAbrvs)) {
  # One tab-delimited patient file per cancer type; the mirror uses the same file name.
  fname <- paste0("nationwidechildrens.org_clinical_patient_", allTcgaClinAbrvs[i], ".txt")
  theUrl <- paste0("https://raw.github.com/paulgeeleher/tcgaData/master/", fname)
  download.file(theUrl, paste0(clinicalFilesDir, fname))
}
# for(i in 1:length(allTcgaClinAbrvs))
# {
#   fname <- paste("nationwidechildrens.org_clinical_patient_", allTcgaClinAbrvs[i], ".txt", sep="")
#   theUrl <- paste("https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor/", allTcgaClinAbrvs[i] ,"/bcr/biotab/clin/nationwidechildrens.org_clinical_patient_", allTcgaClinAbrvs[i], ".txt", sep="")
#   download.file(theUrl, paste(clinicalFilesDir, fname, sep=""))
# }

#' Download the level 3 CNV data. Note that these data have had the germline component removed, thus these are somatic CNVs.
diseaseAbbrvsForCnvs <- c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "COADREAD", "DLBC", "ESCA", "GBM", "GBMLGG", "HNSC", "KICH", "KIPAN", "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "STES", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")
cnvFilesDir <- paste0(theRootDir, "dataIn/tcga_cnv_subtracted/")
dir.create(cnvFilesDir, showWarnings = FALSE, recursive = TRUE) # make this directory (and any missing parents) if it doesn't exist.
for (i in seq_along(diseaseAbbrvsForCnvs)) {
  # The archive has the same name on the server and on disk, so it is built only once.
  cnvFname <- paste0("gdac.broadinstitute.org_", diseaseAbbrvsForCnvs[i], ".Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015082100.0.0.tar.gz")
  cnvUrl <- paste0("http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/", diseaseAbbrvsForCnvs[i], "/20150821/", cnvFname)
  # mode = "wb": the archives are binary, so request a binary transfer explicitly.
  download.file(cnvUrl, paste0(cnvFilesDir, cnvFname), mode = "wb")
}
# Extract the archives. Only the ".tar.gz" files are selected: listing everything in the directory
# would also return the folders extracted by a previous run of this script, which untar() cannot open.
thegzFiles <- list.files(cnvFilesDir, pattern = "\\.tar\\.gz$", full.names = TRUE)
for (gzFile in thegzFiles) {
  untarStatus <- untar(gzFile, exdir = cnvFilesDir)
  # untar() reports success as a return code of 0; anything else is surfaced instead of ignored.
  if (!isTRUE(untarStatus == 0L)) {
    warning("Failed to extract ", gzFile, call. = FALSE)
  }
}


#' Download the somatic mutation data.
diseaseAbbrvsForMuts <- c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "COADREAD", "DLBC", "ESCA", "GBM", "GBMLGG", "HNSC", "KICH", "KIPAN", "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "STES", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")
mutFilesDir <- paste0(theRootDir, "dataIn/mutation_data/")
dir.create(mutFilesDir, showWarnings = FALSE, recursive = TRUE) # make this directory (and any missing parents) if it doesn't exist.
for (i in seq_along(diseaseAbbrvsForMuts)) {
  # The archive has the same name on the server and on disk, so it is built only once.
  fname <- paste0("gdac.broadinstitute.org_", diseaseAbbrvsForMuts[i], ".Mutation_Packager_Calls.Level_3.2016012800.0.0.tar.gz")
  mutationDataUrl <- paste0("http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/", diseaseAbbrvsForMuts[i], "/20160128/", fname)
  # mode = "wb": the archives are binary, so request a binary transfer explicitly.
  download.file(mutationDataUrl, paste0(mutFilesDir, fname), mode = "wb")
}
# Extract the archives. Only the ".tar.gz" files are selected: listing everything in the directory
# would also return the folders extracted by a previous run of this script, which untar() cannot open.
thegzFiles <- list.files(mutFilesDir, pattern = "\\.tar\\.gz$", full.names = TRUE)
for (gzFile in thegzFiles) {
  untarStatus <- untar(gzFile, exdir = mutFilesDir)
  # untar() reports success as a return code of 0; anything else is surfaced instead of ignored.
  if (!isTRUE(untarStatus == 0L)) {
    warning("Failed to extract ", gzFile, call. = FALSE)
  }
}


#' The BRCA "Merge_Clinical" archive holds additional clinical information, which we need in order to add some drug information to one of the supplementary tables (reviewer request). Used in "breast_cancer_analysis.R".
thisFname <- "gdac.broadinstitute.org_BRCA.Merge_Clinical.Level_1.2016012800.0.0.tar.gz"
thisUrl <- paste0("http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/BRCA/20160128/", thisFname)
# The archive is saved into, and then extracted inside, the clinical data directory.
brcaClinicalArchive <- paste0(clinicalFilesDir, thisFname)
download.file(thisUrl, brcaClinicalArchive)
untar(brcaClinicalArchive, exdir = clinicalFilesDir)

#' Download the "categorys.csv" file. This was defined in Geeleher et al, Genome Biology 2016. Here it is downloaded from GitHub.
theUrl <- "https://raw.github.com/paulgeeleher/tcgaData/master/categorys.csv"
download.file(url = theUrl, destfile = paste0(theRootDir, "dataIn/categorys.csv"))

#' Download "cell_lines_copy_number.csv", the CGP CNV file. It is fetched from GitHub here (originally obtained from the CGP/GDSC website: www.cancerrxgene.org).
theUrl <- "https://raw.github.com/paulgeeleher/tcgaData/master/cgp_cnv_data/cell_lines_copy_number.csv"
download.file(url = theUrl, destfile = paste0(theRootDir, "dataIn/cell_lines_copy_number.csv"))


#' Print the session information (as returned by sessionInfo()) to aid reproducibility.
print(sessionInfo())



