This code uses RUV based methods (described in PMID:25150836) to batch correct the TCGA gene expression data.

Set the working directory to the location of the data. NB: This must be modified for your own system.

theRootDir <- "/mnt/data_scratch/finalData/" # root directory for all input data and generated output; edit for your system

Create the directory to store the figures.

# Create the output directory for figures; suppress the warning if it already exists.
dir.create(paste0(theRootDir, "figures/"), showWarnings = FALSE)

Load the pRRophetic library

library("pRRophetic") # NOTE(review): not referenced in this section of the script; presumably used downstream -- confirm

A function for getting p-value from linear regression fit in R. Credit source: http://stackoverflow.com/questions/5587676/pull-out-p-values-and-r-squared-from-a-linear-regression

# Extract the overall (F-statistic) p-value from a fitted linear model.
#
# Args:
#   modelobject: an object of class "lm" (as returned by lm()).
# Returns:
#   A single numeric p-value (attributes stripped) for the model's F test,
#   i.e. pf(F, df1, df2, lower.tail = FALSE) on summary()$fstatistic.
# Raises:
#   An error if the input is not an "lm" object.
lmp <- function (modelobject) {
    # inherits() is robust for multi-class objects (e.g. c("glm", "lm")),
    # where class(x) != "lm" yields a length-> 1 condition and errors in R >= 4.2.
    if (!inherits(modelobject, "lm")) stop("Not an object of class 'lm' ")
    f <- summary(modelobject)$fstatistic
    p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
    attributes(p) <- NULL # drop the "value" name inherited from f[1]
    return(p)
}

Read the directories in which the files are contained. These data were downloaded from firebrowse.org. We provide a script to automate the downloading of these data (see download_tcga_data.R).

theRnaSeqDir <- paste0(theRootDir, "dataIn/rnaSeq/") # the directory containing the RNA-seq data.
theDirs <- dir(theRnaSeqDir)
# Keep only the extracted directories. Using !grepl() rather than -grep() is
# safe when there are zero matches: x[-integer(0)] silently returns an empty
# vector, whereas logical negation leaves x untouched.
theDirs <- theDirs[!grepl(".tar.gz", theDirs, fixed = TRUE)] # ignore the .tar.gz files.

Some of the TCGA data are redundant, i.e. the same samples are contained in different datasets; we need to remove these duplicated samples.

# Extract the cancer-type code from each directory name: the 3rd "."-separated
# field (e.g. "org_ACC") split on "_", taking the 2nd part (e.g. "ACC").
cancerTypeNames <- sapply(sapply(strsplit(theDirs, ".", fixed = TRUE), function(a) return(strsplit(a[[3]], "_"))), function(b) return(b[2]))
removeTypes <- c("COADREAD", "GBMLGG", "KIPAN", "STES") ## NB these must not be included as they are totally redundant, i.e. these samples are identical to those contained in other folders.
theDirsFilt <- theDirs[!cancerTypeNames %in% removeTypes]
cancerTypeNames <- cancerTypeNames[!cancerTypeNames %in% removeTypes]

Load ALL of the data. N.B. This requires a very large amount of memory. It has been tested on a machine with 128Gb of RAM. Note, this code assumes you have obtained the same RNA-seq data as downloaded by the “download_tcga_data.R” script.

# Load every (non-redundant) TCGA RNA-seq dataset into a list of log-transformed
# TPM matrices, one matrix per cancer type (genes x samples).
tpmMatList <- list()
for(i in seq_along(theDirsFilt))
{
  # List the directory once; the data file is the only non-MANIFEST entry.
  dirContents <- dir(paste0(theRnaSeqDir, theDirsFilt[i]))
  theFile <- dirContents[grep("MANIFEST", dirContents, invert = TRUE)]

  tpmDatMat <- read.delim(paste0(theRnaSeqDir, theDirsFilt[i], "/", theFile), as.is = TRUE)

  # Keep only the "scaled_estimate" columns (flagged in the first data row),
  # drop that header row, and coerce to numeric. (The original computed this
  # twice -- the first apply() result was immediately overwritten -- once is enough.)
  tpmDatMat_tpm <- tpmDatMat[-1, which(tpmDatMat[1, ] == "scaled_estimate")]
  tpmDatMat_tpm <- apply(tpmDatMat_tpm, 2, as.numeric)

  # Gene symbols are the part of "Hybridization.REF" before the "|"; [-1]
  # drops the entry corresponding to the discarded header row.
  geneNames <- do.call(cbind, strsplit(tpmDatMat[, "Hybridization.REF"], "|", fixed = TRUE))[1, ][-1]
  rownames(tpmDatMat_tpm) <- geneNames
  colnames(tpmDatMat_tpm) <- substr(colnames(tpmDatMat_tpm), 1, 28) # truncate to the TCGA barcode

  # Scale the "scaled estimates" to TPM (x 1e6) and log-transform (+1 avoids log(0)).
  tpmDatMat_tpm_logged <- log((tpmDatMat_tpm*1000000)+1) # transform the data

  tpmMatList[[i]] <- tpmDatMat_tpm_logged

}
rnames <- lapply(tpmMatList, rownames) # per-dataset gene name vectors

Get the cancer types.

names(tpmMatList) <- cancerTypeNames
numSampls <- sapply(tpmMatList, ncol)
# One cancer-type label per sample column. A single rep() replaces the original
# loop that grew the vector with c() on every iteration (an O(n^2) anti-pattern).
cancerTypesVec <- rep(cancerTypeNames, numSampls)

# Combine all datasets column-wise into one genes x samples matrix.
allExprData <- do.call(cbind, tpmMatList)

Save the gene expression matrix here, we will use this again.

# Persist the combined log-TPM expression matrix; later scripts reload it from here.
save(allExprData, file=paste(theRootDir, "dataIn/allExprData.RData", sep="")) # allExprData

Now that the data are loaded, we wish to calculate the principal components that will be used to “remove unwanted variation” (RUV). First create a matrix of the expression data that is standardized by cancer type, as we do not wish to remove this variability.

# Z-score each gene WITHIN each cancer type, so cancer-type differences are not
# treated as unwanted variation by the downstream RUV PCA.
allCancerTypes <- unique(cancerTypesVec)
standardizeByCancerType <- allExprData
for(i in seq_along(allCancerTypes))
{
  inType <- cancerTypesVec %in% allCancerTypes[i]
  # scale() standardizes columns, so transpose to z-score each gene (row)
  # across this cancer type's samples. scale() uses the n-1 sd, the same as
  # sd(), so this reproduces the per-row (x - mean(x)) / sd(x) exactly; genes
  # with zero variance within a type become NaN in both formulations.
  standardizeByCancerType[, inType] <- t(scale(t(allExprData[, inType])))
}
save(standardizeByCancerType, file=paste(theRootDir, "dataIn/standardizeByCancerType.RData", sep=""))

Find a set of 250 genes that are expressed in all samples and exhibit the lowest variability.

# Number of samples in which each gene has zero expression. rowSums() on the
# logical matrix replaces the per-row apply(), which is much slower.
zeroExprSums <- rowSums(allExprData == 0)
consistentlyExpressedGenes <- which(zeroExprSums == 0) # the genes that are expressed in every sample.
varsExprssed <- apply(allExprData[consistentlyExpressedGenes, ], 1, var)
veryLowVarExpressed <- names(sort(varsExprssed)[1:250]) # the 250 expressed genes with the lowest variability
medExprGene <- apply(allExprData, 1, median) # NOTE(review): computed but not used in this section -- confirm it is needed downstream

Calculate the principal components of these genes.

# Rows (genes) of the standardized matrix containing no NAs; NaNs arise where a
# gene had zero variance within some cancer type. rowSums(is.na(...)) replaces
# the slower per-row apply().
rowNoNas <- which(rowSums(is.na(standardizeByCancerType)) == 0)
noNasLowVar <- intersect(rownames(standardizeByCancerType[rowNoNas, ]), veryLowVarExpressed)
# PCA on samples using the NA-free low-variance genes.
# NOTE(review): noNasLowVar[-1] drops the first qualifying gene before the PCA
# -- presumably deliberate, but worth confirming against the original analysis.
ruvPcs_standardized <- prcomp(t(standardizeByCancerType[noNasLowVar[-1], ]))

As a sanity check, are these RUV components actually correlated with batch ID, if they are, that is a very good thing, because it shows that these principal components have identified the real batches in a completely unbiased way.

# The TCGA plate/batch identifier is the 6th "."-separated field of the sample barcode.
batchIds <- sapply(strsplit(colnames(allExprData), ".", fixed = TRUE), function(l) l[6])
# Does PC1 associate with batch after accounting for cancer type? (Printed below.)
summary(lm(ruvPcs_standardized$x[, 1] ~ factor(batchIds) + factor(cancerTypesVec)))
## 
## Call:
## lm(formula = ruvPcs_standardized$x[, 1] ~ factor(batchIds) + 
##     factor(cancerTypesVec))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.605  -4.525  -0.774   3.649  67.418 
## 
## Coefficients: (6 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 0.73926    2.80662   0.263 0.792248    
## factor(batchIds)0734        3.98220    2.96791   1.342 0.179708    
## factor(batchIds)0735       -3.64563    3.06061  -1.191 0.233625    
## factor(batchIds)0736       -3.36769    3.01498  -1.117 0.264027    
## factor(batchIds)0740       -4.74375    3.18509  -1.489 0.136425    
## factor(batchIds)0744       -4.69347    7.74815  -0.606 0.544692    
## factor(batchIds)0748       -0.55918    3.79430  -0.147 0.882839    
## factor(batchIds)0751       -7.36880    7.74815  -0.951 0.341608    
## factor(batchIds)0760        7.85904    4.07062   1.931 0.053552 .  
## factor(batchIds)0851       -0.39881    1.92265  -0.207 0.835680    
## factor(batchIds)0864        0.62718    2.77765   0.226 0.821365    
## factor(batchIds)0905       -1.17818    7.49732  -0.157 0.875133    
## factor(batchIds)0946        1.46296    2.37136   0.617 0.537296    
## factor(batchIds)0980        0.84730    1.98304   0.427 0.669188    
## factor(batchIds)1100        0.91244    2.68797   0.339 0.734276    
## factor(batchIds)1107        1.30646    2.80735   0.465 0.641675    
## factor(batchIds)1188        2.63291    3.65241   0.721 0.471007    
## factor(batchIds)1193       -0.60435    3.50185  -0.173 0.862984    
## factor(batchIds)1201       -0.72094    2.31975  -0.311 0.755971    
## factor(batchIds)1206        0.65954    2.15171   0.307 0.759216    
## factor(batchIds)1277        4.25383    2.42079   1.757 0.078914 .  
## factor(batchIds)1289        2.83771    2.38546   1.190 0.234238    
## factor(batchIds)1305        5.12186    2.39938   2.135 0.032813 *  
## factor(batchIds)1325        1.62862    2.40440   0.677 0.498200    
## factor(batchIds)1334        5.49288    2.39938   2.289 0.022083 *  
## factor(batchIds)1351       -3.68577    2.96410  -1.243 0.213724    
## factor(batchIds)1410       -1.91816    2.70423  -0.709 0.478145    
## factor(batchIds)1420        2.40926    2.39938   1.004 0.315345    
## factor(batchIds)1426        4.82057    2.40440   2.005 0.045002 *  
## factor(batchIds)1436       -1.26667    2.19549  -0.577 0.563993    
## factor(batchIds)1443        1.28665    2.11889   0.607 0.543713    
## factor(batchIds)1470       -3.29349    2.29245  -1.437 0.150843    
## factor(batchIds)1503       -0.87390    2.33185  -0.375 0.707842    
## factor(batchIds)1514       -1.74507    2.46793  -0.707 0.479521    
## factor(batchIds)1541       -0.39822    2.28346  -0.174 0.861559    
## factor(batchIds)1564        1.08284    3.18509   0.340 0.733886    
## factor(batchIds)1565        0.75995    2.99024   0.254 0.799390    
## factor(batchIds)1566       -1.39987    2.92950  -0.478 0.632764    
## factor(batchIds)1567       -1.15681    2.97678  -0.389 0.697572    
## factor(batchIds)1568       -1.29774    2.93458  -0.442 0.658336    
## factor(batchIds)1569        0.50741    3.12088   0.163 0.870848    
## factor(batchIds)1580       -4.25263    2.07120  -2.053 0.040078 *  
## factor(batchIds)1592        0.56832    2.23155   0.255 0.798980    
## factor(batchIds)1628        1.77448    2.22220   0.799 0.424588    
## factor(batchIds)1635        0.44015    2.08480   0.211 0.832794    
## factor(batchIds)1653       -4.02634    2.28877  -1.759 0.078580 .  
## factor(batchIds)1660       -2.33408    2.95366  -0.790 0.429411    
## factor(batchIds)1672        0.21245    2.34783   0.090 0.927901    
## factor(batchIds)1686       -1.46570    2.20349  -0.665 0.505957    
## factor(batchIds)1708       -2.08487    2.30641  -0.904 0.366046    
## factor(batchIds)1723       -2.13928    2.22892  -0.960 0.337188    
## factor(batchIds)1736       -2.07429    2.73030  -0.760 0.447435    
## factor(batchIds)1755        0.36872    2.06141   0.179 0.858045    
## factor(batchIds)1758        8.40296    2.37094   3.544 0.000396 ***
## factor(batchIds)1766       18.30672    5.54307   3.303 0.000961 ***
## factor(batchIds)1774       -3.57448    2.36370  -1.512 0.130506    
## factor(batchIds)1789        2.75958    2.36836   1.165 0.243971    
## factor(batchIds)1820        1.88157    1.98304   0.949 0.342730    
## factor(batchIds)1830       -0.87982    2.76804  -0.318 0.750606    
## factor(batchIds)1839       -3.36354    2.24857  -1.496 0.134724    
## factor(batchIds)1849       -1.04900    2.92046  -0.359 0.719462    
## factor(batchIds)1850       -1.21892    2.93284  -0.416 0.677704    
## factor(batchIds)1858        2.37621    1.98993   1.194 0.232461    
## factor(batchIds)1873       -1.79530    2.25019  -0.798 0.424980    
## factor(batchIds)1896       -1.23501    2.41403  -0.512 0.608947    
## factor(batchIds)1915        3.21269    2.12743   1.510 0.131044    
## factor(batchIds)1928       -6.16168    2.32038  -2.655 0.007933 ** 
## factor(batchIds)1949        2.06989    1.88075   1.101 0.271112    
## factor(batchIds)1965        1.15758    2.04026   0.567 0.570477    
## factor(batchIds)2005        3.49744    3.40626   1.027 0.304555    
## factor(batchIds)2016        1.95885    2.01714   0.971 0.331522    
## factor(batchIds)2027       -0.43366    2.18191  -0.199 0.842459    
## factor(batchIds)2039       -1.59228    2.17260  -0.733 0.463641    
## factor(batchIds)2045        2.04146    2.00441   1.018 0.308474    
## factor(batchIds)2066       -2.14158    2.21898  -0.965 0.334510    
## factor(batchIds)2081        1.79607    2.22073   0.809 0.418665    
## factor(batchIds)2090       -0.46028    2.30441  -0.200 0.841689    
## factor(batchIds)2118       -0.68481    2.04732  -0.334 0.738016    
## factor(batchIds)2125        0.23771    1.94617   0.122 0.902787    
## factor(batchIds)2132       -2.64760    2.37824  -1.113 0.265625    
## factor(batchIds)2139       -3.06189    2.81364  -1.088 0.276521    
## factor(batchIds)2156       -0.19170    2.41670  -0.079 0.936776    
## factor(batchIds)2170        2.14276    2.34743   0.913 0.361366    
## factor(batchIds)2187       -3.03194    2.15351  -1.408 0.159191    
## factor(batchIds)2204       -2.32925    2.55178  -0.913 0.361373    
## factor(batchIds)2213        5.15920    3.50607   1.472 0.141187    
## factor(batchIds)2232       -5.16590    2.46793  -2.093 0.036356 *  
## factor(batchIds)2241       -2.25301    2.15171  -1.047 0.295090    
## factor(batchIds)2247       -0.80170    1.94617  -0.412 0.680392    
## factor(batchIds)2256       -2.33235    2.21486  -1.053 0.292345    
## factor(batchIds)2263       -0.81667    2.35135  -0.347 0.728358    
## factor(batchIds)2287       -1.48645    2.49153  -0.597 0.550789    
## factor(batchIds)2296        0.13236    2.22875   0.059 0.952643    
## factor(batchIds)2315        9.25186    7.49587   1.234 0.217136    
## factor(batchIds)2326       -1.24585    2.08928  -0.596 0.550983    
## factor(batchIds)2347       -0.20800    7.45745  -0.028 0.977749    
## factor(batchIds)2403       -1.08750    1.85793  -0.585 0.558338    
## factor(batchIds)2404        0.26022    2.11804   0.123 0.902220    
## factor(batchIds)A00Z       -2.20523    1.99401  -1.106 0.268785    
## factor(batchIds)A034        1.08827    1.98825   0.547 0.584151    
## factor(batchIds)A056        0.27089    2.12126   0.128 0.898388    
## factor(batchIds)A083       -1.92732    4.63019  -0.416 0.677236    
## factor(batchIds)A084       -0.59944    2.11953  -0.283 0.777322    
## factor(batchIds)A089        2.46210    2.42357   1.016 0.309704    
## factor(batchIds)A104       -4.38144    5.45736  -0.803 0.422081    
## factor(batchIds)A109       -3.69855    2.17425  -1.701 0.088962 .  
## factor(batchIds)A10J       -0.26393    2.32294  -0.114 0.909543    
## factor(batchIds)A10U        2.18090    2.13238   1.023 0.306448    
## factor(batchIds)A115       -2.25047    1.98921  -1.131 0.257939    
## factor(batchIds)A118        8.00579    5.45736   1.467 0.142416    
## factor(batchIds)A12D       -1.81045    2.06381  -0.877 0.380380    
## factor(batchIds)A12P       -0.79974    2.07106  -0.386 0.699392    
## factor(batchIds)A131        2.12641    2.08536   1.020 0.307904    
## factor(batchIds)A137       -0.57769    2.31825  -0.249 0.803219    
## factor(batchIds)A13Q       -0.47826    1.99738  -0.239 0.810765    
## factor(batchIds)A13S       -0.47515    4.59238  -0.103 0.917596    
## factor(batchIds)A13Y       -2.90263    2.11249  -1.374 0.169462    
## factor(batchIds)A144       -1.05177    2.12694  -0.495 0.620963    
## factor(batchIds)A14D       -0.93754    2.24910  -0.417 0.676795    
## factor(batchIds)A14M       -0.33532    2.20912  -0.152 0.879357    
## factor(batchIds)A14Y        0.26285    2.09836   0.125 0.900316    
## factor(batchIds)A155        0.71804    2.36374   0.304 0.761307    
## factor(batchIds)A157        1.62664    2.17962   0.746 0.455508    
## factor(batchIds)A169       -1.13819    2.11056  -0.539 0.589704    
## factor(batchIds)A16F       -0.47759    2.19368  -0.218 0.827658    
## factor(batchIds)A16R        1.53046    2.26000   0.677 0.498299    
## factor(batchIds)A16W       -0.83529    2.52156  -0.331 0.740457    
## factor(batchIds)A17B       -0.51891    2.93355  -0.177 0.859600    
## factor(batchIds)A180       -0.04023    2.00702  -0.020 0.984009    
## factor(batchIds)A18C       -3.27862    2.23604  -1.466 0.142608    
## factor(batchIds)A18M        1.20246    2.10719   0.571 0.568252    
## factor(batchIds)A18S       -4.56940    2.00545  -2.278 0.022719 *  
## factor(batchIds)A18T       -3.86305    2.00250  -1.929 0.053746 .  
## factor(batchIds)A18U       -3.70030    2.28025  -1.623 0.104673    
## factor(batchIds)A19E        3.94681    7.45297   0.530 0.596427    
## factor(batchIds)A19O       -2.29806    2.11445  -1.087 0.277136    
## factor(batchIds)A19W       -0.78425    2.19953  -0.357 0.721433    
## factor(batchIds)A206        1.82135    2.17909   0.836 0.403271    
## factor(batchIds)A20F       -2.70013    2.03945  -1.324 0.185551    
## factor(batchIds)A213        1.52621    1.98811   0.768 0.442703    
## factor(batchIds)A21D       -1.25177    2.09268  -0.598 0.549741    
## factor(batchIds)A21T        1.37275    2.11523   0.649 0.516363    
## factor(batchIds)A220       -0.19902    2.17000  -0.092 0.926927    
## factor(batchIds)A22K       -1.45471    2.02252  -0.719 0.471998    
## factor(batchIds)A22L        0.20190    2.04744   0.099 0.921450    
## factor(batchIds)A22U       -1.26017    2.02201  -0.623 0.533151    
## factor(batchIds)A239       -1.59850    2.19079  -0.730 0.465623    
## factor(batchIds)A23N        0.49264    2.03892   0.242 0.809082    
## factor(batchIds)A23W        1.12381    2.20901   0.509 0.610946    
## factor(batchIds)A24H        0.35351    1.87542   0.188 0.850491    
## factor(batchIds)A24K       -2.44527    4.62295  -0.529 0.596858    
## factor(batchIds)A24X       -1.92516    1.94537  -0.990 0.322389    
## factor(batchIds)A24Z       -0.72651    2.09750  -0.346 0.729070    
## factor(batchIds)A250        1.76513    2.06199   0.856 0.392001    
## factor(batchIds)A260        0.23872    4.51422   0.053 0.957828    
## factor(batchIds)A262       -0.65534    1.97785  -0.331 0.740395    
## factor(batchIds)A266       -0.86241    1.95918  -0.440 0.659811    
## factor(batchIds)A26B       14.92300    5.42218   2.752 0.005930 ** 
## factor(batchIds)A26T        0.99826    2.14309   0.466 0.641366    
## factor(batchIds)A26U       -1.41654    2.05964  -0.688 0.491620    
## factor(batchIds)A26W       -0.68384    1.88951  -0.362 0.717425    
## factor(batchIds)A27Q        0.63717    1.86256   0.342 0.732289    
## factor(batchIds)A27V       -0.60864    2.23730  -0.272 0.785596    
## factor(batchIds)A28H       -2.19949    2.09191  -1.051 0.293089    
## factor(batchIds)A28J       -3.01794    4.78540  -0.631 0.528280    
## factor(batchIds)A28M       -1.59889    2.03128  -0.787 0.431223    
## factor(batchIds)A28V       -0.97177    2.16138  -0.450 0.653006    
## factor(batchIds)A29R       -2.05581    2.00604  -1.025 0.305479    
## factor(batchIds)A29S       -0.73926    2.92187  -0.253 0.800266    
## factor(batchIds)A30B        1.55558    2.10632   0.739 0.460210    
## factor(batchIds)A30C       -0.60505    2.14070  -0.283 0.777456    
## factor(batchIds)A311       -1.00231    2.10043  -0.477 0.633236    
## factor(batchIds)A31N        1.49718    1.99488   0.751 0.452965    
## factor(batchIds)A31O       -1.02893    2.17445  -0.473 0.636088    
## factor(batchIds)A31P       -3.45874    4.90955  -0.704 0.481143    
## factor(batchIds)A32O       -2.62164    2.02831  -1.293 0.196206    
## factor(batchIds)A32P       -2.46539    2.10234  -1.173 0.240949    
## factor(batchIds)A32Q        0.66357    2.00625   0.331 0.740840    
## factor(batchIds)A32Y        2.70046    2.15338   1.254 0.209852    
## factor(batchIds)A32Z       -0.05548    2.17079  -0.026 0.979609    
## factor(batchIds)A336       -1.25598    4.61072  -0.272 0.785317    
## factor(batchIds)A33J       -3.38984    2.02493  -1.674 0.094153 .  
## factor(batchIds)A33R       -3.40293    2.12742  -1.600 0.109731    
## factor(batchIds)A33Z       -5.07581    2.04095  -2.487 0.012900 *  
## factor(batchIds)A34F       -2.17076    2.13293  -1.018 0.308828    
## factor(batchIds)A34R       -2.82843    1.96401  -1.440 0.149863    
## factor(batchIds)A352       -1.91821    1.96360  -0.977 0.328651    
## factor(batchIds)A354       -3.41220    4.90955  -0.695 0.487063    
## factor(batchIds)A355       -1.72896    2.27115  -0.761 0.446514    
## factor(batchIds)A35K        0.86586    2.90379   0.298 0.765571    
## factor(batchIds)A35L       -2.36164    2.90482  -0.813 0.416232    
## factor(batchIds)A36D       -4.97088    4.57977  -1.085 0.277771    
## factor(batchIds)A36F       -1.20903    1.97267  -0.613 0.539963    
## factor(batchIds)A36G       -1.19658    2.08741  -0.573 0.566498    
## factor(batchIds)A36H       -2.48192    2.08712  -1.189 0.234404    
## factor(batchIds)A37I       -4.52811    4.49726  -1.007 0.314027    
## factor(batchIds)A37K       -3.05175    1.96226  -1.555 0.119926    
## factor(batchIds)A37L       -1.81767    1.98794  -0.914 0.360556    
## factor(batchIds)A37O       -1.96109    2.09034  -0.938 0.348182    
## factor(batchIds)A38B       -2.42050    1.97811  -1.224 0.221117    
## factor(batchIds)A38C       -1.09035    2.00870  -0.543 0.587270    
## factor(batchIds)A38D       -5.81476    4.86102  -1.196 0.231647    
## factor(batchIds)A39D       -1.41845    1.94820  -0.728 0.466580    
## factor(batchIds)A39I       -3.40542    2.01265  -1.692 0.090677 .  
## factor(batchIds)A405       -0.73926    2.68797  -0.275 0.783302    
## factor(batchIds)A406       -5.14032    3.69761  -1.390 0.164508    
## factor(batchIds)A40A       -2.74683    2.23809  -1.227 0.219736    
## factor(batchIds)A41B       -1.37837    1.95529  -0.705 0.480864    
## factor(batchIds)A41C       -1.00200    2.02845  -0.494 0.621336    
## factor(batchIds)A41I       -1.21978    2.42938  -0.502 0.615612    
## factor(batchIds)A41O       -1.82473    2.05613  -0.887 0.374852    
## factor(batchIds)A42C        0.79491    2.56047   0.310 0.756221    
## factor(batchIds)A42S        0.09915    2.11129   0.047 0.962546    
## factor(batchIds)A42T       -2.80449    1.99522  -1.406 0.159873    
## factor(batchIds)A430       -1.65349    2.90379  -0.569 0.569080    
## factor(batchIds)A431        0.64683    2.95269   0.219 0.826605    
## factor(batchIds)A466       10.65288    2.89303   3.682 0.000232 ***
## factor(cancerTypesVec)BLCA  0.76095    2.52594   0.301 0.763226    
## factor(cancerTypesVec)BRCA  0.03417    2.50995   0.014 0.989139    
## factor(cancerTypesVec)CESC  0.27004    2.53059   0.107 0.915021    
## factor(cancerTypesVec)CHOL  0.48052    3.15765   0.152 0.879052    
## factor(cancerTypesVec)COAD  1.90028    2.63873   0.720 0.471451    
## factor(cancerTypesVec)DLBC -3.51345    3.48344  -1.009 0.313183    
## factor(cancerTypesVec)ESCA  2.29545    3.31826   0.692 0.489103    
## factor(cancerTypesVec)GBM        NA         NA      NA       NA    
## factor(cancerTypesVec)HNSC -0.52527    2.52837  -0.208 0.835429    
## factor(cancerTypesVec)KICH -9.87751    7.64973  -1.291 0.196657    
## factor(cancerTypesVec)KIRC -2.66770    2.74902  -0.970 0.331863    
## factor(cancerTypesVec)KIRP  0.03291    2.54559   0.013 0.989686    
## factor(cancerTypesVec)LAML       NA         NA      NA       NA    
## factor(cancerTypesVec)LGG   0.85681    2.52322   0.340 0.734186    
## factor(cancerTypesVec)LIHC -0.08793    2.53052  -0.035 0.972283    
## factor(cancerTypesVec)LUAD -0.94167    2.45957  -0.383 0.701832    
## factor(cancerTypesVec)LUSC -1.00369    2.32239  -0.432 0.665618    
## factor(cancerTypesVec)MESO  1.75972    2.81392   0.625 0.531748    
## factor(cancerTypesVec)OV         NA         NA      NA       NA    
## factor(cancerTypesVec)PAAD  0.65451    2.56458   0.255 0.798565    
## factor(cancerTypesVec)PCPG       NA         NA      NA       NA    
## factor(cancerTypesVec)PRAD  0.57263    2.53627   0.226 0.821379    
## factor(cancerTypesVec)READ  1.22948    2.92675   0.420 0.674432    
## factor(cancerTypesVec)SARC  0.15817    2.52394   0.063 0.950034    
## factor(cancerTypesVec)SKCM  2.10772    2.52405   0.835 0.403706    
## factor(cancerTypesVec)TGCT       NA         NA      NA       NA    
## factor(cancerTypesVec)THCA -0.60615    2.55454  -0.237 0.812443    
## factor(cancerTypesVec)THYM -1.43248    2.97990  -0.481 0.630730    
## factor(cancerTypesVec)UCEC -0.24655    2.57182  -0.096 0.923628    
## factor(cancerTypesVec)UCS   0.23250    2.91486   0.080 0.936426    
## factor(cancerTypesVec)UVM        NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.222 on 9726 degrees of freedom
## Multiple R-squared:  0.05976,    Adjusted R-squared:  0.03647 
## F-statistic: 2.565 on 241 and 9726 DF,  p-value: < 2.2e-16
# For each of the first 100 RUV PCs, quantify how strongly it associates with
# batch ID: the R-squared and overall p-value of PC ~ factor(batch).
theRsquareds <- numeric(100) # preallocated: filling by index avoids growing in the loop
thePvals <- numeric(100)
for(i in seq_len(100))
{
  theMod <- lm(ruvPcs_standardized$x[, i] ~ factor(batchIds))
  theRsquareds[i] <- summary(theMod)$r.squared
  thePvals[i] <- lmp(theMod)
}
print(theRsquareds[1:10])
##  [1] 0.05576464 0.22312837 0.17869422 0.11473238 0.06189570 0.04170722
##  [7] 0.04977357 0.06469582 0.05487097 0.05170802
print(thePvals[1:10])
##  [1]  4.025558e-33  0.000000e+00 1.701332e-274 2.945888e-136  4.294664e-42
##  [6]  3.147887e-15  5.609054e-25  2.236910e-46  7.269734e-32  1.562697e-27

Supplementary Figure: the proportion of variability in “batch” captured by each of the RUV principal components. This clearly levels off after 10, suggesting that 10 PCs, the number proposed by the original authors, is appropriate in this case.

# Supplementary figure: batch R-squared for each RUV principal component, with
# a red line at PC 10 (the cutoff used below).
pdf(paste0(theRootDir, "figures/theRsquareds_ruv_components_against_batch.pdf"), width = 4, height = 4)
plot(theRsquareds, pch = 20, ylab = "R squared", xlab = "Principal Component")
abline(v = 10, col = "red")
dev.off()
## png 
##   2

Finally, for each gene, regress out the 10 RUV PCs that we have calculated on the matrix of standardized expression data, then save this updated matrix.

# Batch-correct the expression data: for each gene, regress its (logged)
# expression on the first 10 RUV principal components and keep the residuals.
# NOTE(review): the response is already log-transformed, and the rendered output
# below shows many "glm.fit: algorithm did not converge" warnings from this
# loop -- confirm that family="quasipoisson" (rather than gaussian/lm) is the
# intended model here. residuals() on a glm fit returns deviance residuals by
# default (see ?residuals.glm).
tenRuvNewStandardApproach <- allExprData
for(i in 1:nrow(allExprData))
{
  tenRuvNewStandardApproach[i,] <- residuals(glm(allExprData[i,]~ruvPcs_standardized$x[, 1:10], family="quasipoisson"))
}
## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge
# Persist the batch-corrected expression matrix together with the per-sample
# cancer-type labels for use by downstream scripts.
save(tenRuvNewStandardApproach, cancerTypesVec, file=paste(theRootDir, "dataIn/tenRuvNewStandardApproach.RData", sep=""))