Set the working directory to the location of the data. NB: This must be modified for your own system.
# Root directory for all input data and outputs. Must end with "/" because
# every path below is built with paste(theRootDir, ..., sep = "").
theRootDir <- "/mnt/data_scratch/finalData/"
Create the directory to store the figures.
# Create the figures output directory; showWarnings=FALSE makes this a no-op
# if it already exists.
dir.create(paste(theRootDir, "figures/", sep = ""), showWarnings = FALSE)
Load the pRRophetic library
library("pRRophetic")
A function for getting p-value from linear regression fit in R. Credit source: http://stackoverflow.com/questions/5587676/pull-out-p-values-and-r-squared-from-a-linear-regression
# Overall p-value of a fitted linear model, computed from the F statistic
# reported by summary.lm(). Credit:
# http://stackoverflow.com/questions/5587676/pull-out-p-values-and-r-squared-from-a-linear-regression
#
# Args:
#   modelobject: a model fitted with lm() (objects of other classes are rejected).
# Returns:
#   A bare numeric scalar: the p-value of the model's F test.
lmp <- function(modelobject) {
  # identical() avoids the length->1 condition error that `class(x) != "lm"`
  # raises (R >= 4.2) for objects whose class attribute is a vector, while
  # still accepting exactly plain "lm" objects, as the original intended.
  if (!identical(class(modelobject), "lm")) stop("Not an object of class 'lm' ")
  f <- summary(modelobject)$fstatistic  # c(value, numdf, dendf)
  # Upper-tail probability of the F distribution = overall model p-value.
  p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
  attributes(p) <- NULL  # strip the "value" name so a bare numeric is returned
  return(p)
}
Read the directories in which the files are contained. These data were downloaded from firebrowse.org. We provide a script to automate the downloading of these data (see download_tcga_data.R).
theRnaSeqDir <- paste(theRootDir, "dataIn/rnaSeq/", sep = "") # the directory containing the RNA-seq data.
theDirs <- dir(theRnaSeqDir)
# Drop the .tar.gz archives. grepl() is used rather than theDirs[-grep(...)]:
# when there are no matches, grep() returns integer(0) and x[-integer(0)]
# silently yields an EMPTY vector, discarding every directory.
theDirs <- theDirs[!grepl(".tar.gz", theDirs, fixed = TRUE)]
Some of the TCGA data are redundant, i.e. the same samples are contained in different datasets; we need to remove these duplicated samples.
# Extract the cancer-type code (e.g. "BRCA") from each directory name: it is
# the token after the underscore in the third dot-delimited field of the name.
splitDirNames <- strsplit(theDirs, ".", fixed = TRUE)
cancerTypeNames <- sapply(splitDirNames, function(parts) strsplit(parts[[3]], "_")[[1]][2])
# NB: these cohorts must not be included as they are totally redundant, i.e.
# aggregates whose samples are identical to those contained in other folders.
removeTypes <- c("COADREAD", "GBMLGG", "KIPAN", "STES")
keepMask <- !cancerTypeNames %in% removeTypes
theDirsFilt <- theDirs[keepMask]
cancerTypeNames <- cancerTypeNames[keepMask]
Load ALL of the data. N.B. This requires a very large amount of memory. It has been tested on a machine with 128GB of RAM. Note, this code assumes you have obtained the same RNA-seq data as downloaded by the "download_tcga_data.R" script.
# Load the RNA-seq file for every (non-redundant) cancer type. Each file's
# first data row labels the measurement type per column; we keep only the
# "scaled_estimate" columns, scale them to TPM, and log-transform.
tpmMatList <- list()
for (i in seq_along(theDirsFilt))
{
  # Hoist the dir() listing (the original called dir() twice per iteration)
  # and take the one non-MANIFEST file in the directory.
  dirContents <- dir(paste(theRnaSeqDir, theDirsFilt[i], sep = ""))
  theFile <- dirContents[grep("MANIFEST", dirContents, invert = TRUE)]
  tpmDatMat <- read.delim(paste(theRnaSeqDir, theDirsFilt[i], "/", theFile, sep = ""), as.is = TRUE)
  # Select the scaled_estimate columns (dropping the header row) and convert
  # to numeric. NB: the original computed this apply() and then immediately
  # recomputed the identical result in two more statements; once is enough.
  tpmDatMat_tpm <- apply(tpmDatMat[-1, which(tpmDatMat[1, ] == "scaled_estimate")], 2, as.numeric)
  # Gene symbols are the part before "|" in the Hybridization.REF column
  # (the first entry is the header-row label, hence the [-1]).
  geneNames <- do.call(cbind, strsplit(tpmDatMat[, "Hybridization.REF"], "|", fixed = TRUE))[1, ][-1]
  rownames(tpmDatMat_tpm) <- geneNames
  # Truncate sample barcodes to a common length.
  colnames(tpmDatMat_tpm) <- substr(colnames(tpmDatMat_tpm), 1, 28)
  # scaled_estimate * 1e6 = TPM; add a pseudo-count of 1 before logging.
  tpmDatMat_tpm_logged <- log((tpmDatMat_tpm * 1000000) + 1) # transform the data
  tpmMatList[[i]] <- tpmDatMat_tpm_logged
}
# Gene (row) names of each per-cancer-type matrix.
# NOTE(review): rnames is not referenced again in this chunk — presumably kept
# as a sanity check that all matrices share the same gene order; confirm
# before removing.
rnames <- lapply(tpmMatList, rownames)
Get the cancer types.
names(tpmMatList) <- cancerTypeNames
numSampls <- sapply(tpmMatList, ncol)
# One cancer-type label per sample (column), in the same order the matrices
# are cbind-ed below. rep() with a 'times' vector produces exactly what the
# original grow-with-c() loop built, without O(n^2) copying.
cancerTypesVec <- rep(cancerTypeNames, times = numSampls)
# Merge all per-cancer matrices into one genes x samples matrix.
allExprData <- do.call(cbind, tpmMatList)
Save the gene expression matrix here, we will use this again.
# Persist the merged expression matrix; later scripts reload it from here.
save(allExprData, file=paste(theRootDir, "dataIn/allExprData.RData", sep="")) # allExprData
Now that the data are loaded, we wish to calculate the principal components that will be used to "remove unwanted variation" (RUV). First create a matrix of the expression data that is standardized by cancer type, as we do not wish to remove this variability.
allCancerTypes <- unique(cancerTypesVec)
standardizeByCancerType <- allExprData
for (i in seq_along(allCancerTypes))
{
  # This column mask is identical for every gene, so compute it once per
  # cancer type (the original recomputed it inside a per-gene inner loop).
  typeCols <- cancerTypesVec %in% allCancerTypes[i]
  # Z-score each gene (row) across the samples of this cancer type.
  # t(scale(t(.))) applies the same (x - mean(x)) / sd(x) the original
  # per-row loop used (scale() uses the n-1 standard deviation, like sd()),
  # producing NA/NaN for zero-variance genes exactly as before; those rows
  # are filtered out downstream.
  standardizeByCancerType[, typeCols] <- t(scale(t(allExprData[, typeCols])))
}
# Persist the per-cancer-type standardized matrix.
save(standardizeByCancerType, file=paste(theRootDir, "dataIn/standardizeByCancerType.RData", sep=""))
Find a set of 250 genes that are expressed in all samples and exhibit the lowest variability.
# For each gene, count the samples in which it isn't expressed (value == 0).
# rowSums on the logical matrix gives the identical result to the original
# per-row apply() with an anonymous sum, far faster on a matrix of this size.
zeroExprSums <- rowSums(allExprData == 0)
consistentlyExpressedGenes <- which(zeroExprSums == 0) # the genes that are expressed in every sample.
# Variance of each consistently expressed gene across all samples.
varsExprssed <- apply(allExprData[consistentlyExpressedGenes, ], 1, var)
veryLowVarExpressed <- names(sort(varsExprssed)[1:250]) # the 250 expressed genes with the lowest variability
# Per-gene median expression.
# NOTE(review): medExprGene is not used below in this chunk — confirm it is
# needed before removing.
medExprGene <- apply(allExprData, 1, median)
Calculate the principal components of these genes.
# Rows (genes) of the standardized matrix with no NA entries; NAs arise for
# genes with zero variance within some cancer type.
rowNoNas <- which(apply(standardizeByCancerType, 1, function(row)return(sum(is.na(row)))) == 0)
# Restrict the 250 low-variance genes to those NA-free rows.
noNasLowVar <- intersect(rownames(standardizeByCancerType[rowNoNas, ]), veryLowVarExpressed)
# PCA over samples using only these stable genes; the leading PCs estimate the
# unwanted (batch-like) variation to be regressed out below.
# NOTE(review): the [-1] drops the first gene of the set — presumably
# intentional (it reproduces the published analysis), but worth confirming.
ruvPcs_standardized <- prcomp(t(standardizeByCancerType[noNasLowVar[-1], ]))
As a sanity check, are these RUV components actually correlated with batch ID? If they are, that is a very good thing, because it shows that these principal components have identified the real batches in a completely unbiased way.
# The plate/batch identifier is the sixth dot-delimited field of each TCGA
# sample barcode.
batchIds <- sapply(strsplit(colnames(allExprData), ".", fixed = TRUE), `[`, 6)
# Regress PC1 on batch while controlling for cancer type; significant batch
# coefficients indicate the PC has captured genuine batch effects.
summary(lm(ruvPcs_standardized$x[, 1] ~ factor(batchIds) + factor(cancerTypesVec)))
##
## Call:
## lm(formula = ruvPcs_standardized$x[, 1] ~ factor(batchIds) +
## factor(cancerTypesVec))
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.605 -4.525 -0.774 3.649 67.418
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73926 2.80662 0.263 0.792248
## factor(batchIds)0734 3.98220 2.96791 1.342 0.179708
## factor(batchIds)0735 -3.64563 3.06061 -1.191 0.233625
## factor(batchIds)0736 -3.36769 3.01498 -1.117 0.264027
## factor(batchIds)0740 -4.74375 3.18509 -1.489 0.136425
## factor(batchIds)0744 -4.69347 7.74815 -0.606 0.544692
## factor(batchIds)0748 -0.55918 3.79430 -0.147 0.882839
## factor(batchIds)0751 -7.36880 7.74815 -0.951 0.341608
## factor(batchIds)0760 7.85904 4.07062 1.931 0.053552 .
## factor(batchIds)0851 -0.39881 1.92265 -0.207 0.835680
## factor(batchIds)0864 0.62718 2.77765 0.226 0.821365
## factor(batchIds)0905 -1.17818 7.49732 -0.157 0.875133
## factor(batchIds)0946 1.46296 2.37136 0.617 0.537296
## factor(batchIds)0980 0.84730 1.98304 0.427 0.669188
## factor(batchIds)1100 0.91244 2.68797 0.339 0.734276
## factor(batchIds)1107 1.30646 2.80735 0.465 0.641675
## factor(batchIds)1188 2.63291 3.65241 0.721 0.471007
## factor(batchIds)1193 -0.60435 3.50185 -0.173 0.862984
## factor(batchIds)1201 -0.72094 2.31975 -0.311 0.755971
## factor(batchIds)1206 0.65954 2.15171 0.307 0.759216
## factor(batchIds)1277 4.25383 2.42079 1.757 0.078914 .
## factor(batchIds)1289 2.83771 2.38546 1.190 0.234238
## factor(batchIds)1305 5.12186 2.39938 2.135 0.032813 *
## factor(batchIds)1325 1.62862 2.40440 0.677 0.498200
## factor(batchIds)1334 5.49288 2.39938 2.289 0.022083 *
## factor(batchIds)1351 -3.68577 2.96410 -1.243 0.213724
## factor(batchIds)1410 -1.91816 2.70423 -0.709 0.478145
## factor(batchIds)1420 2.40926 2.39938 1.004 0.315345
## factor(batchIds)1426 4.82057 2.40440 2.005 0.045002 *
## factor(batchIds)1436 -1.26667 2.19549 -0.577 0.563993
## factor(batchIds)1443 1.28665 2.11889 0.607 0.543713
## factor(batchIds)1470 -3.29349 2.29245 -1.437 0.150843
## factor(batchIds)1503 -0.87390 2.33185 -0.375 0.707842
## factor(batchIds)1514 -1.74507 2.46793 -0.707 0.479521
## factor(batchIds)1541 -0.39822 2.28346 -0.174 0.861559
## factor(batchIds)1564 1.08284 3.18509 0.340 0.733886
## factor(batchIds)1565 0.75995 2.99024 0.254 0.799390
## factor(batchIds)1566 -1.39987 2.92950 -0.478 0.632764
## factor(batchIds)1567 -1.15681 2.97678 -0.389 0.697572
## factor(batchIds)1568 -1.29774 2.93458 -0.442 0.658336
## factor(batchIds)1569 0.50741 3.12088 0.163 0.870848
## factor(batchIds)1580 -4.25263 2.07120 -2.053 0.040078 *
## factor(batchIds)1592 0.56832 2.23155 0.255 0.798980
## factor(batchIds)1628 1.77448 2.22220 0.799 0.424588
## factor(batchIds)1635 0.44015 2.08480 0.211 0.832794
## factor(batchIds)1653 -4.02634 2.28877 -1.759 0.078580 .
## factor(batchIds)1660 -2.33408 2.95366 -0.790 0.429411
## factor(batchIds)1672 0.21245 2.34783 0.090 0.927901
## factor(batchIds)1686 -1.46570 2.20349 -0.665 0.505957
## factor(batchIds)1708 -2.08487 2.30641 -0.904 0.366046
## factor(batchIds)1723 -2.13928 2.22892 -0.960 0.337188
## factor(batchIds)1736 -2.07429 2.73030 -0.760 0.447435
## factor(batchIds)1755 0.36872 2.06141 0.179 0.858045
## factor(batchIds)1758 8.40296 2.37094 3.544 0.000396 ***
## factor(batchIds)1766 18.30672 5.54307 3.303 0.000961 ***
## factor(batchIds)1774 -3.57448 2.36370 -1.512 0.130506
## factor(batchIds)1789 2.75958 2.36836 1.165 0.243971
## factor(batchIds)1820 1.88157 1.98304 0.949 0.342730
## factor(batchIds)1830 -0.87982 2.76804 -0.318 0.750606
## factor(batchIds)1839 -3.36354 2.24857 -1.496 0.134724
## factor(batchIds)1849 -1.04900 2.92046 -0.359 0.719462
## factor(batchIds)1850 -1.21892 2.93284 -0.416 0.677704
## factor(batchIds)1858 2.37621 1.98993 1.194 0.232461
## factor(batchIds)1873 -1.79530 2.25019 -0.798 0.424980
## factor(batchIds)1896 -1.23501 2.41403 -0.512 0.608947
## factor(batchIds)1915 3.21269 2.12743 1.510 0.131044
## factor(batchIds)1928 -6.16168 2.32038 -2.655 0.007933 **
## factor(batchIds)1949 2.06989 1.88075 1.101 0.271112
## factor(batchIds)1965 1.15758 2.04026 0.567 0.570477
## factor(batchIds)2005 3.49744 3.40626 1.027 0.304555
## factor(batchIds)2016 1.95885 2.01714 0.971 0.331522
## factor(batchIds)2027 -0.43366 2.18191 -0.199 0.842459
## factor(batchIds)2039 -1.59228 2.17260 -0.733 0.463641
## factor(batchIds)2045 2.04146 2.00441 1.018 0.308474
## factor(batchIds)2066 -2.14158 2.21898 -0.965 0.334510
## factor(batchIds)2081 1.79607 2.22073 0.809 0.418665
## factor(batchIds)2090 -0.46028 2.30441 -0.200 0.841689
## factor(batchIds)2118 -0.68481 2.04732 -0.334 0.738016
## factor(batchIds)2125 0.23771 1.94617 0.122 0.902787
## factor(batchIds)2132 -2.64760 2.37824 -1.113 0.265625
## factor(batchIds)2139 -3.06189 2.81364 -1.088 0.276521
## factor(batchIds)2156 -0.19170 2.41670 -0.079 0.936776
## factor(batchIds)2170 2.14276 2.34743 0.913 0.361366
## factor(batchIds)2187 -3.03194 2.15351 -1.408 0.159191
## factor(batchIds)2204 -2.32925 2.55178 -0.913 0.361373
## factor(batchIds)2213 5.15920 3.50607 1.472 0.141187
## factor(batchIds)2232 -5.16590 2.46793 -2.093 0.036356 *
## factor(batchIds)2241 -2.25301 2.15171 -1.047 0.295090
## factor(batchIds)2247 -0.80170 1.94617 -0.412 0.680392
## factor(batchIds)2256 -2.33235 2.21486 -1.053 0.292345
## factor(batchIds)2263 -0.81667 2.35135 -0.347 0.728358
## factor(batchIds)2287 -1.48645 2.49153 -0.597 0.550789
## factor(batchIds)2296 0.13236 2.22875 0.059 0.952643
## factor(batchIds)2315 9.25186 7.49587 1.234 0.217136
## factor(batchIds)2326 -1.24585 2.08928 -0.596 0.550983
## factor(batchIds)2347 -0.20800 7.45745 -0.028 0.977749
## factor(batchIds)2403 -1.08750 1.85793 -0.585 0.558338
## factor(batchIds)2404 0.26022 2.11804 0.123 0.902220
## factor(batchIds)A00Z -2.20523 1.99401 -1.106 0.268785
## factor(batchIds)A034 1.08827 1.98825 0.547 0.584151
## factor(batchIds)A056 0.27089 2.12126 0.128 0.898388
## factor(batchIds)A083 -1.92732 4.63019 -0.416 0.677236
## factor(batchIds)A084 -0.59944 2.11953 -0.283 0.777322
## factor(batchIds)A089 2.46210 2.42357 1.016 0.309704
## factor(batchIds)A104 -4.38144 5.45736 -0.803 0.422081
## factor(batchIds)A109 -3.69855 2.17425 -1.701 0.088962 .
## factor(batchIds)A10J -0.26393 2.32294 -0.114 0.909543
## factor(batchIds)A10U 2.18090 2.13238 1.023 0.306448
## factor(batchIds)A115 -2.25047 1.98921 -1.131 0.257939
## factor(batchIds)A118 8.00579 5.45736 1.467 0.142416
## factor(batchIds)A12D -1.81045 2.06381 -0.877 0.380380
## factor(batchIds)A12P -0.79974 2.07106 -0.386 0.699392
## factor(batchIds)A131 2.12641 2.08536 1.020 0.307904
## factor(batchIds)A137 -0.57769 2.31825 -0.249 0.803219
## factor(batchIds)A13Q -0.47826 1.99738 -0.239 0.810765
## factor(batchIds)A13S -0.47515 4.59238 -0.103 0.917596
## factor(batchIds)A13Y -2.90263 2.11249 -1.374 0.169462
## factor(batchIds)A144 -1.05177 2.12694 -0.495 0.620963
## factor(batchIds)A14D -0.93754 2.24910 -0.417 0.676795
## factor(batchIds)A14M -0.33532 2.20912 -0.152 0.879357
## factor(batchIds)A14Y 0.26285 2.09836 0.125 0.900316
## factor(batchIds)A155 0.71804 2.36374 0.304 0.761307
## factor(batchIds)A157 1.62664 2.17962 0.746 0.455508
## factor(batchIds)A169 -1.13819 2.11056 -0.539 0.589704
## factor(batchIds)A16F -0.47759 2.19368 -0.218 0.827658
## factor(batchIds)A16R 1.53046 2.26000 0.677 0.498299
## factor(batchIds)A16W -0.83529 2.52156 -0.331 0.740457
## factor(batchIds)A17B -0.51891 2.93355 -0.177 0.859600
## factor(batchIds)A180 -0.04023 2.00702 -0.020 0.984009
## factor(batchIds)A18C -3.27862 2.23604 -1.466 0.142608
## factor(batchIds)A18M 1.20246 2.10719 0.571 0.568252
## factor(batchIds)A18S -4.56940 2.00545 -2.278 0.022719 *
## factor(batchIds)A18T -3.86305 2.00250 -1.929 0.053746 .
## factor(batchIds)A18U -3.70030 2.28025 -1.623 0.104673
## factor(batchIds)A19E 3.94681 7.45297 0.530 0.596427
## factor(batchIds)A19O -2.29806 2.11445 -1.087 0.277136
## factor(batchIds)A19W -0.78425 2.19953 -0.357 0.721433
## factor(batchIds)A206 1.82135 2.17909 0.836 0.403271
## factor(batchIds)A20F -2.70013 2.03945 -1.324 0.185551
## factor(batchIds)A213 1.52621 1.98811 0.768 0.442703
## factor(batchIds)A21D -1.25177 2.09268 -0.598 0.549741
## factor(batchIds)A21T 1.37275 2.11523 0.649 0.516363
## factor(batchIds)A220 -0.19902 2.17000 -0.092 0.926927
## factor(batchIds)A22K -1.45471 2.02252 -0.719 0.471998
## factor(batchIds)A22L 0.20190 2.04744 0.099 0.921450
## factor(batchIds)A22U -1.26017 2.02201 -0.623 0.533151
## factor(batchIds)A239 -1.59850 2.19079 -0.730 0.465623
## factor(batchIds)A23N 0.49264 2.03892 0.242 0.809082
## factor(batchIds)A23W 1.12381 2.20901 0.509 0.610946
## factor(batchIds)A24H 0.35351 1.87542 0.188 0.850491
## factor(batchIds)A24K -2.44527 4.62295 -0.529 0.596858
## factor(batchIds)A24X -1.92516 1.94537 -0.990 0.322389
## factor(batchIds)A24Z -0.72651 2.09750 -0.346 0.729070
## factor(batchIds)A250 1.76513 2.06199 0.856 0.392001
## factor(batchIds)A260 0.23872 4.51422 0.053 0.957828
## factor(batchIds)A262 -0.65534 1.97785 -0.331 0.740395
## factor(batchIds)A266 -0.86241 1.95918 -0.440 0.659811
## factor(batchIds)A26B 14.92300 5.42218 2.752 0.005930 **
## factor(batchIds)A26T 0.99826 2.14309 0.466 0.641366
## factor(batchIds)A26U -1.41654 2.05964 -0.688 0.491620
## factor(batchIds)A26W -0.68384 1.88951 -0.362 0.717425
## factor(batchIds)A27Q 0.63717 1.86256 0.342 0.732289
## factor(batchIds)A27V -0.60864 2.23730 -0.272 0.785596
## factor(batchIds)A28H -2.19949 2.09191 -1.051 0.293089
## factor(batchIds)A28J -3.01794 4.78540 -0.631 0.528280
## factor(batchIds)A28M -1.59889 2.03128 -0.787 0.431223
## factor(batchIds)A28V -0.97177 2.16138 -0.450 0.653006
## factor(batchIds)A29R -2.05581 2.00604 -1.025 0.305479
## factor(batchIds)A29S -0.73926 2.92187 -0.253 0.800266
## factor(batchIds)A30B 1.55558 2.10632 0.739 0.460210
## factor(batchIds)A30C -0.60505 2.14070 -0.283 0.777456
## factor(batchIds)A311 -1.00231 2.10043 -0.477 0.633236
## factor(batchIds)A31N 1.49718 1.99488 0.751 0.452965
## factor(batchIds)A31O -1.02893 2.17445 -0.473 0.636088
## factor(batchIds)A31P -3.45874 4.90955 -0.704 0.481143
## factor(batchIds)A32O -2.62164 2.02831 -1.293 0.196206
## factor(batchIds)A32P -2.46539 2.10234 -1.173 0.240949
## factor(batchIds)A32Q 0.66357 2.00625 0.331 0.740840
## factor(batchIds)A32Y 2.70046 2.15338 1.254 0.209852
## factor(batchIds)A32Z -0.05548 2.17079 -0.026 0.979609
## factor(batchIds)A336 -1.25598 4.61072 -0.272 0.785317
## factor(batchIds)A33J -3.38984 2.02493 -1.674 0.094153 .
## factor(batchIds)A33R -3.40293 2.12742 -1.600 0.109731
## factor(batchIds)A33Z -5.07581 2.04095 -2.487 0.012900 *
## factor(batchIds)A34F -2.17076 2.13293 -1.018 0.308828
## factor(batchIds)A34R -2.82843 1.96401 -1.440 0.149863
## factor(batchIds)A352 -1.91821 1.96360 -0.977 0.328651
## factor(batchIds)A354 -3.41220 4.90955 -0.695 0.487063
## factor(batchIds)A355 -1.72896 2.27115 -0.761 0.446514
## factor(batchIds)A35K 0.86586 2.90379 0.298 0.765571
## factor(batchIds)A35L -2.36164 2.90482 -0.813 0.416232
## factor(batchIds)A36D -4.97088 4.57977 -1.085 0.277771
## factor(batchIds)A36F -1.20903 1.97267 -0.613 0.539963
## factor(batchIds)A36G -1.19658 2.08741 -0.573 0.566498
## factor(batchIds)A36H -2.48192 2.08712 -1.189 0.234404
## factor(batchIds)A37I -4.52811 4.49726 -1.007 0.314027
## factor(batchIds)A37K -3.05175 1.96226 -1.555 0.119926
## factor(batchIds)A37L -1.81767 1.98794 -0.914 0.360556
## factor(batchIds)A37O -1.96109 2.09034 -0.938 0.348182
## factor(batchIds)A38B -2.42050 1.97811 -1.224 0.221117
## factor(batchIds)A38C -1.09035 2.00870 -0.543 0.587270
## factor(batchIds)A38D -5.81476 4.86102 -1.196 0.231647
## factor(batchIds)A39D -1.41845 1.94820 -0.728 0.466580
## factor(batchIds)A39I -3.40542 2.01265 -1.692 0.090677 .
## factor(batchIds)A405 -0.73926 2.68797 -0.275 0.783302
## factor(batchIds)A406 -5.14032 3.69761 -1.390 0.164508
## factor(batchIds)A40A -2.74683 2.23809 -1.227 0.219736
## factor(batchIds)A41B -1.37837 1.95529 -0.705 0.480864
## factor(batchIds)A41C -1.00200 2.02845 -0.494 0.621336
## factor(batchIds)A41I -1.21978 2.42938 -0.502 0.615612
## factor(batchIds)A41O -1.82473 2.05613 -0.887 0.374852
## factor(batchIds)A42C 0.79491 2.56047 0.310 0.756221
## factor(batchIds)A42S 0.09915 2.11129 0.047 0.962546
## factor(batchIds)A42T -2.80449 1.99522 -1.406 0.159873
## factor(batchIds)A430 -1.65349 2.90379 -0.569 0.569080
## factor(batchIds)A431 0.64683 2.95269 0.219 0.826605
## factor(batchIds)A466 10.65288 2.89303 3.682 0.000232 ***
## factor(cancerTypesVec)BLCA 0.76095 2.52594 0.301 0.763226
## factor(cancerTypesVec)BRCA 0.03417 2.50995 0.014 0.989139
## factor(cancerTypesVec)CESC 0.27004 2.53059 0.107 0.915021
## factor(cancerTypesVec)CHOL 0.48052 3.15765 0.152 0.879052
## factor(cancerTypesVec)COAD 1.90028 2.63873 0.720 0.471451
## factor(cancerTypesVec)DLBC -3.51345 3.48344 -1.009 0.313183
## factor(cancerTypesVec)ESCA 2.29545 3.31826 0.692 0.489103
## factor(cancerTypesVec)GBM NA NA NA NA
## factor(cancerTypesVec)HNSC -0.52527 2.52837 -0.208 0.835429
## factor(cancerTypesVec)KICH -9.87751 7.64973 -1.291 0.196657
## factor(cancerTypesVec)KIRC -2.66770 2.74902 -0.970 0.331863
## factor(cancerTypesVec)KIRP 0.03291 2.54559 0.013 0.989686
## factor(cancerTypesVec)LAML NA NA NA NA
## factor(cancerTypesVec)LGG 0.85681 2.52322 0.340 0.734186
## factor(cancerTypesVec)LIHC -0.08793 2.53052 -0.035 0.972283
## factor(cancerTypesVec)LUAD -0.94167 2.45957 -0.383 0.701832
## factor(cancerTypesVec)LUSC -1.00369 2.32239 -0.432 0.665618
## factor(cancerTypesVec)MESO 1.75972 2.81392 0.625 0.531748
## factor(cancerTypesVec)OV NA NA NA NA
## factor(cancerTypesVec)PAAD 0.65451 2.56458 0.255 0.798565
## factor(cancerTypesVec)PCPG NA NA NA NA
## factor(cancerTypesVec)PRAD 0.57263 2.53627 0.226 0.821379
## factor(cancerTypesVec)READ 1.22948 2.92675 0.420 0.674432
## factor(cancerTypesVec)SARC 0.15817 2.52394 0.063 0.950034
## factor(cancerTypesVec)SKCM 2.10772 2.52405 0.835 0.403706
## factor(cancerTypesVec)TGCT NA NA NA NA
## factor(cancerTypesVec)THCA -0.60615 2.55454 -0.237 0.812443
## factor(cancerTypesVec)THYM -1.43248 2.97990 -0.481 0.630730
## factor(cancerTypesVec)UCEC -0.24655 2.57182 -0.096 0.923628
## factor(cancerTypesVec)UCS 0.23250 2.91486 0.080 0.936426
## factor(cancerTypesVec)UVM NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.222 on 9726 degrees of freedom
## Multiple R-squared: 0.05976, Adjusted R-squared: 0.03647
## F-statistic: 2.565 on 241 and 9726 DF, p-value: < 2.2e-16
# Quantify how strongly each of the first 100 RUV PCs associates with batch.
# Vectors are preallocated to their final length rather than grown one
# element at a time inside the loop.
nPcsToCheck <- 100
theRsquareds <- numeric(nPcsToCheck)
thePvals <- numeric(nPcsToCheck)
for (i in seq_len(nPcsToCheck))
{
  theMod <- lm(ruvPcs_standardized$x[, i] ~ factor(batchIds))
  theRsquareds[i] <- summary(theMod)$r.squared
  thePvals[i] <- lmp(theMod)
}
# R^2 of batch against each of the first ten PCs: batch explains a sizeable
# share of the variance in the leading components.
print(theRsquareds[1:10])
## [1] 0.05576464 0.22312837 0.17869422 0.11473238 0.06189570 0.04170722
## [7] 0.04977357 0.06469582 0.05487097 0.05170802
# Corresponding p-values: all extremely significant.
print(thePvals[1:10])
## [1] 4.025558e-33 0.000000e+00 1.701332e-274 2.945888e-136 4.294664e-42
## [6] 3.147887e-15 5.609054e-25 2.236910e-46 7.269734e-32 1.562697e-27
Supplementary Figure: the proportion of variability in "batch" captured by each of the RUV principal components. This clearly levels off after 10, suggesting that 10 PCs, the number proposed by the original authors, is appropriate in this case.
# Supplementary figure: R^2 (batch vs PC) for each principal component.
pdf(paste(theRootDir, "figures/theRsquareds_ruv_components_against_batch.pdf", sep=""), width=4, height=4)
plot(theRsquareds, pch=20, ylab="R squared", xlab="Principal Component")
# Red line marks 10 PCs — the cutoff used for the RUV correction below.
abline(v=10, col="red")
dev.off()
## png
## 2
Finally, for each gene, regress out the 10 RUV PCs that we have calculated on the matrix of standardized expression data, then save this updated matrix.
# Remove the unwanted variation: for every gene, fit a quasipoisson GLM of the
# logged expression on the first 10 RUV PCs and keep the residuals as the
# corrected values. (Non-convergence warnings for some genes are expected;
# see the recorded run log below.) The output matrix starts as a copy of
# allExprData so dimensions and dimnames are preserved.
tenRuvNewStandardApproach <- allExprData
for (geneIdx in seq_len(nrow(allExprData)))
{
  ruvFit <- glm(allExprData[geneIdx, ] ~ ruvPcs_standardized$x[, 1:10], family = "quasipoisson")
  tenRuvNewStandardApproach[geneIdx, ] <- residuals(ruvFit)
}
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
# Persist the RUV-corrected expression matrix together with the per-sample
# cancer-type labels; downstream scripts load both from this file.
save(tenRuvNewStandardApproach, cancerTypesVec, file=paste(theRootDir, "dataIn/tenRuvNewStandardApproach.RData", sep=""))