This document contains supplemental R code for reproducing results from Maftools: Efficient and comprehensive analysis of somatic variants in cancer
Functions from maftools are called with the namespace maftools::. See SessionInfo at the end of the document for complete package details.
#Install maftools from Bioconductor
source("https://bioconductor.org/biocLite.R")
biocLite("maftools")
#For specific version of maftools used in the manuscript (v1.6.05), install from frozen GitHub pub branch
# library(devtools)
# devtools::install_github(repo = "PoisonAlien/maftools", ref = "pub")
library(maftools)
library(NMF)
library(VennDiagram)
library(Vennerable)
library(ggplot2)
#ESCA: import the MAF together with its clinical annotation; system.time() reports how long the import takes
system.time(
  expr = esca <- maftools::read.maf(
    maf = "TCGA_ESCA.maf.gz",
    clinicalData = "TCGA_ESCA_clinical.tsv",
    verbose = FALSE
  )
)
## reading maf..
## NOTE: Non MAF specific values in Variant_Classification column:
## [1] "Start_Codon_Del" "Stop_Codon_Ins"
## Done !
## user system elapsed
## 1.924 0.064 1.998
#Separate ESCA into ESCC and EAC cohorts based on histology
esca.clinical <- maftools::getClinicalData(x = esca)
#TRUE for squamous cell carcinoma samples; every remaining sample goes to the EAC cohort
is.squamous <- esca.clinical$primary_pathology_histological_type %in% "Esophagus_Squamous_Cell_Carcinoma"
esca.sq.tsbs <- esca.clinical$Tumor_Sample_Barcode[is.squamous]
esca.ad.tsbs <- esca.clinical$Tumor_Sample_Barcode[!is.squamous]
#Build one MAF object per cohort
escc <- maftools::subsetMaf(maf = esca, tsb = esca.sq.tsbs, mafObj = TRUE)
eac <- maftools::subsetMaf(maf = esca, tsb = esca.ad.tsbs, mafObj = TRUE)
#BRCA WGS MAF: import without clinical data and time the import
system.time(
  expr = brca <- maftools::read.maf(
    maf = "TCGA_BRCA_WGS.maf.gz",
    verbose = FALSE
  )
)
## reading maf..
## Done !
## user system elapsed
## 2.396 0.104 2.508
#LAML MAF: import together with its clinical annotation and time the import
system.time(
  expr = laml <- maftools::read.maf(
    maf = "TCGA_LAML.maf.gz",
    clinicalData = "TCGA_LAML_clinical.tsv",
    verbose = FALSE
  )
)
## reading maf..
## Done !
## user system elapsed
## 0.200 0.036 0.243
#Colors for the Histology annotation (reused by later plots that take annotationColor/color)
hist.col <- list(Histology = c('EAC' = '#FB8072', 'ESCC' = '#80B1D3'))
#pdf(file = "esca_oncoplot.pdf", width = 10, height = 7, paper = "special", bg = "white")
#Oncoplot annotated and sorted by Histology, using the MutSigCV result file with a q-value cutoff of 0.1
maftools::oncoplot(maf = esca, clinicalFeatures = "Histology", mutsig = 'ESCA_MutSigCV/ESCA-TP.sig_genes.txt.gz', sortByAnnotation = TRUE, mutsigQval = 0.1, annotationColor = hist.col, fontSize = 10)
#dev.off()
#Compute the transition/transversion summary without plotting, then plot it explicitly
esca.titv <- maftools::titv(maf = esca, plot = FALSE)
#Namespace-qualified like every other maftools call in this document
maftools::plotTiTv(res = esca.titv)
## NULL
#Lollipop plot of TP53 mutations in ESCA; labelPos selects positions 248, 273 and 175 for labelling
maftools::lollipopPlot(
  maf = esca,
  gene = "TP53",
  labelPos = c(248, 273, 175)
)
Rainfall plot for the representative sample shown in Figure 1D
#Rainfall plot with change-point detection for a single BRCA sample
maftools::rainfallPlot(
  maf = brca,
  tsb = "TCGA-A8-A08B",
  detectChangePoints = TRUE
)
## Processing TCGA-A8-A08B..
## Change points detected at:
## Chromosome Start_Position End_Position nMuts Avg_intermutation_dist
## 1: 8 98000822 98456466 33 14238.88
## 2: 8 124090377 124096810 22 306.33
## 3: 8 136085710 140768420 16 312180.67
## 4: 17 28925504 34484595 25 231628.79
## 5: 17 38619251 40070193 13 120911.83
## 6: 23 108609210 110760883 6 430334.60
## Size C>G C>T C>A T>A T>C T>G Filter
## 1: 455644 11 22 NA NA NA NA .
## 2: 6433 1 21 NA NA NA NA PASS
## 3: 4682710 2 10 4 NA NA NA .
## 4: 5559091 7 12 2 1 2 1 .
## 5: 1450942 3 9 1 NA NA NA .
## 6: 2151673 1 2 1 NA 1 1 .
Run for all samples and prepare Supplemental_Table_S1
#Get all sample IDs
brca.tsbs <- as.character(maftools::getSampleSummary(x = brca)[, Tumor_Sample_Barcode])
#Apply rainfallPlot to every sample. Only the side effects are needed here (the change-point
#files collected below), so the returned list is discarded instead of being stored.
invisible(lapply(brca.tsbs, function(tsb) {
  maftools::rainfallPlot(maf = brca, detectChangePoints = TRUE, tsb = tsb)
}))
#Gather all output files generated by the calls above
brca.cps <- list.files(pattern = "_changePoints")
#Read and merge into a single table; fill = TRUE pads columns that are absent from some files
brca.kats <- data.table::rbindlist(lapply(brca.cps, data.table::fread), fill = TRUE)
write.table(brca.kats, file = "Supplemental_Table_S1.tsv", sep = '\t', quote = FALSE, row.names = FALSE)
Signature analysis
#Path to the hg19 reference genome (adjust to the local installation)
hg19.fa <- "/bigdisk/ref_db/hg19/fa/hg19.fa"
#For ESCC: build the trinucleotide matrix from the reference genome and time the step
system.time(
  expr = escc.tnm <- maftools::trinucleotideMatrix(
    maf = escc,
    ref_genome = hg19.fa,
    prefix = "chr",
    add = TRUE
  )
)
## reading /bigdisk/ref_db/hg19/fa/hg19.fa (this might take few minutes)..
## Warning in maftools::trinucleotideMatrix(maf = escc, ref_genome =
## hg19.fa, : Chromosome names in MAF must match chromosome names in reference
## fasta. Ignorinig 554 single nucleotide variants from missing chromosomes
## chr23, chr24
## Extracting 5' and 3' adjacent bases..
## Extracting +/- 20bp around mutated bases for background C>T estimation..
## Estimating APOBEC enrichment scores..
## Performing one-way Fisher's test for APOBEC enrichment..
## APOBEC related mutations are enriched in 26.042% of samples (APOBEC enrichment score > 2 ; 25 of 96 samples)
## Creating mutation matrix..
## matrix of dimension 96x96
## user system elapsed
## 17.256 2.252 29.619
#Extract mutational signatures for ESCC from the trinucleotide matrix (timed)
system.time(
  expr = escc.sig <- maftools::extractSignatures(
    mat = escc.tnm,
    parallel = "P3",
    nTry = 8
  )
)
## Estimating best rank..
## method seed rng metric rank sparseness.basis sparseness.coef rss
## 1: brunet random 1 KL 2 0.5306021 0.5208087 39669.08
## 2: brunet random 6 KL 3 0.5680904 0.3887575 26765.15
## 3: brunet random 5 KL 4 0.5496541 0.4065412 22801.67
## 4: brunet random 3 KL 5 0.5526710 0.4254033 20616.83
## 5: brunet random 6 KL 6 0.5737730 0.4517907 18766.57
## 6: brunet random 4 KL 7 0.5798829 0.4761835 17579.53
## 7: brunet random 2 KL 8 0.5932377 0.4835888 16403.01
## evar silhouette.coef silhouette.basis residuals niter cpu
## 1: 0.7629815 1.0000000 1.0000000 5616.754 470 0.124
## 2: 0.8400811 0.7618033 0.8200777 4868.709 450 0.148
## 3: 0.8637625 0.6053517 0.6598350 4583.411 700 0.248
## 4: 0.8768166 0.5372801 0.5747045 4349.541 660 0.260
## 5: 0.8878717 0.5101924 0.5328021 4156.612 1530 0.564
## 6: 0.8949642 0.4382449 0.5179072 3992.185 1260 0.564
## 7: 0.9019938 0.3959969 0.4617969 3842.871 2000 1.004
## cpu.all nrun cophenetic dispersion silhouette.consensus
## 1: 4.816 10 0.9995303 0.9851736 0.9955865
## 2: 5.316 10 0.9869938 0.9260590 0.9479500
## 3: 5.568 10 0.9865768 0.7890712 0.8956212
## 4: 7.124 10 0.9633915 0.8023177 0.7803073
## 5: 7.616 10 0.9618033 0.8063108 0.7481140
## 6: 8.148 10 0.9349485 0.7958073 0.6424952
## 7: 9.244 10 0.9314505 0.8155295 0.5845807
## Using 3 as a best-fit rank based on decreasing cophenetic correlation coefficient.
## Comparing against experimentally validated 30 signatures.. (See http://cancer.sanger.ac.uk/cosmic/signatures for details.)
## Found Signature_1 most similar to validated Signature_13. Aetiology: APOBEC Cytidine Deaminase (C>G) [cosine-similarity: 0.838]
## Found Signature_2 most similar to validated Signature_6. Aetiology: defective DNA mismatch repair [cosine-similarity: 0.929]
## Found Signature_3 most similar to validated Signature_4. Aetiology: exposure to tobacco (smoking) mutagens [cosine-similarity: 0.881]
## user system elapsed
## 74.080 8.812 39.370
#For EAC: build the trinucleotide matrix from the reference genome and time the step
system.time(
  eac.tnm <- maftools::trinucleotideMatrix(
    maf = eac,
    ref_genome = hg19.fa,
    prefix = "chr",
    add = TRUE
  )
)
## reading /bigdisk/ref_db/hg19/fa/hg19.fa (this might take few minutes)..
## Warning in maftools::trinucleotideMatrix(maf = eac, ref_genome = hg19.fa, :
## Chromosome names in MAF must match chromosome names in reference fasta.
## Ignorinig 840 single nucleotide variants from missing chromosomes chr23,
## chr24
## Extracting 5' and 3' adjacent bases..
## Extracting +/- 20bp around mutated bases for background C>T estimation..
## Estimating APOBEC enrichment scores..
## Performing one-way Fisher's test for APOBEC enrichment..
## APOBEC related mutations are enriched in 0% of samples (APOBEC enrichment score > 2 ; 0 of 89 samples)
## Creating mutation matrix..
## matrix of dimension 89x96
## user system elapsed
## 10.636 1.192 11.899
#Extract mutational signatures for EAC from the trinucleotide matrix (timed)
system.time(
  eac.sig <- maftools::extractSignatures(
    mat = eac.tnm,
    parallel = "P3",
    nTry = 8
  )
)
## Estimating best rank..
## method seed rng metric rank sparseness.basis sparseness.coef rss
## 1: brunet random 4 KL 2 0.5009027 0.3026437 90879.21
## 2: brunet random 5 KL 3 0.5554445 0.3172066 36610.05
## 3: brunet random 5 KL 4 0.5701976 0.3405838 28935.22
## 4: brunet random 3 KL 5 0.5773712 0.4119355 25128.97
## 5: brunet random 1 KL 6 0.5948495 0.4140505 21480.11
## 6: brunet random 2 KL 7 0.5944406 0.4339903 20042.54
## 7: brunet random 5 KL 8 0.5829818 0.4561130 19561.32
## evar silhouette.coef silhouette.basis residuals niter cpu
## 1: 0.8452432 1.0000000 1.0000000 5943.593 770 0.196
## 2: 0.9376573 0.7506887 0.7959748 4864.049 640 0.176
## 3: 0.9507267 0.6018015 0.7691925 4335.320 1230 0.368
## 4: 0.9572083 0.5199186 0.6432341 4085.009 1440 0.484
## 5: 0.9634219 0.4715414 0.5706040 3890.788 800 0.312
## 6: 0.9658699 0.4173380 0.4980237 3756.538 1120 0.472
## 7: 0.9666893 0.3578377 0.4540708 3632.213 1520 1.136
## cpu.all nrun cophenetic dispersion silhouette.consensus
## 1: 4.100 10 0.8900892 0.3489509 0.6171745
## 2: 4.208 10 0.9690168 0.8383537 0.8795046
## 3: 4.984 10 0.9795232 0.8794900 0.8935544
## 4: 5.668 10 0.9777927 0.8843479 0.8726386
## 5: 5.932 10 0.9353045 0.7715238 0.6979691
## 6: 6.872 10 0.9365607 0.7826537 0.6196948
## 7: 9.544 10 0.8824548 0.7445272 0.4442875
## Using 5 as a best-fit rank based on decreasing cophenetic correlation coefficient.
## Comparing against experimentally validated 30 signatures.. (See http://cancer.sanger.ac.uk/cosmic/signatures for details.)
## Found Signature_1 most similar to validated Signature_3. Aetiology: defects in DNA-DSB repair by HR [cosine-similarity: 0.792]
## Found Signature_2 most similar to validated Signature_17. Aetiology: Unknown [cosine-similarity: 0.979]
## Found Signature_3 most similar to validated Signature_6. Aetiology: defective DNA mismatch repair [cosine-similarity: 0.952]
## Found Signature_4 most similar to validated Signature_29. Aetiology: exposure to tobacco (chewing) mutagens [cosine-similarity: 0.85]
## Found Signature_5 most similar to validated Signature_1. Aetiology: spontaneous deamination of 5-methylcytosine [cosine-similarity: 0.932]
## user system elapsed
## 69.804 9.308 35.319
#Write the ESCC APOBEC enrichment results as a tab-separated table
write.table(
  x = escc.tnm$APOBEC_scores,
  file = "Supplemental_Table_S2.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
Plot signatures
#Plot the extracted signatures, ESCC first and then EAC
for (sig.res in list(escc.sig, eac.sig)) {
  maftools::plotSignatures(nmfRes = sig.res)
}
Plot differences between APOBEC enriched and non APOBEC enriched samples.
#Compare APOBEC enriched against non-enriched ESCC samples; the returned result is kept
escc.apobec.diff <- maftools::plotApobecDiff(
  tnm = escc.tnm,
  maf = escc,
  title_size = 1.1
)
Perform signature enrichment analysis
#Signature enrichment analysis for ESCC (timed)
system.time(
  expr = escc.sig.enrichment <- maftools::signatureEnrichment(
    maf = escc,
    sig_res = escc.sig
  )
)
## Running k-means for signature assignment..
## Performing pairwise and groupwise comparisions..
## Sample size per factor in Signature:
##
## Signature_1 Signature_2 Signature_3
## 18 56 22
## Estimating mutation load and signature exposures..
## user system elapsed
## 5.968 0.004 6.011
#Signature enrichment analysis for EAC (timed)
system.time(
  expr = eac.sig.enrichment <- maftools::signatureEnrichment(
    maf = eac,
    sig_res = eac.sig
  )
)
## Running k-means for signature assignment..
## Performing pairwise and groupwise comparisions..
## Sample size per factor in Signature:
##
## Signature_1 Signature_2 Signature_3 Signature_4 Signature_5
## 9 15 18 16 31
## Estimating mutation load and signature exposures..
## user system elapsed
## 22.280 0.032 22.431
#Export the group-wise signature enrichment results for ESCC
write.table(x = escc.sig.enrichment$groupwise_comparision, file = "Supplemental_Table_S3.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
#Genes with group-wise p-value below 0.01
escc.sig.genes <- escc.sig.enrichment$groupwise_comparision[p_value < 0.01, Hugo_Symbol]
#Get all samples associated with Signature 2
escc.sig2.tsbs <- escc.sig.enrichment$Signature_Assignment[Signature %in% 'Signature_2', Tumor_Sample_Barcode]
#Rest of the samples
rest.tsbs <- escc.sig.enrichment$Signature_Assignment[!Tumor_Sample_Barcode %in% escc.sig2.tsbs, Tumor_Sample_Barcode]
#Generate MAF objects for the two sample groups
escc.sig2 <- maftools::subsetMaf(maf = escc, tsb = escc.sig2.tsbs, mafObj = TRUE)
escc.res <- maftools::subsetMaf(maf = escc, tsb = rest.tsbs, mafObj = TRUE)
#Draw side-by-side oncoplot of TGFBR2 and ZNF292 for the Signature_2 samples versus the rest
#(panel title spelling corrected: "Signature_6 like")
maftools::coOncoplot(m1 = escc.sig2, m2 = escc.res, genes = c("TGFBR2", "ZNF292"), m1Name = "Signature_6 like", m2Name = "Rest", removeNonMutated = FALSE, geneNamefont = 16)
Note: Above plot has been cropped in the manuscript to show only TGFBR2 mutations.
#Lollipop plot of TGFBR2 mutations in ESCC, with every position labelled
maftools::lollipopPlot(
  maf = escc,
  gene = "TGFBR2",
  labelPos = 'all',
  cBioPortal = TRUE
)
## HGNC refseq.ID protein.ID aa.length
## 1: TGFBR2 NM_001024847 NP_001020018 592
## 2: TGFBR2 NM_003242 NP_003233 567
Compare ESCC and EAC to identify differentially mutated genes
#Compare the ESCC and EAC cohorts with minMut = 5 (timed)
system.time(
  expr = escc.vs.eac <- maftools::mafCompare(
    m1 = escc,
    m2 = eac,
    m1Name = "ESCC",
    m2Name = "EAC",
    minMut = 5
  )
)
## user system elapsed
## 0.692 0.000 0.699
#Forest plot of all significantly differentially mutated genes (P-Value < 0.01)
maftools::forestPlot(
  escc.vs.eac,
  pVal = 0.01,
  color = as.character(unlist(hist.col)),
  geneFontSize = 0.7
)
#Export the full comparison table
write.table(
  x = escc.vs.eac$results,
  file = "Supplemental_Table_S4.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
#Gene sets of interest
VGSC.genes <- c('SCN5A', 'SCN3A', 'SCN9A')
erbb.sig.path <- c('ERBB2', 'ERBB4', 'TP53')
#One EAC-vs-ESCC co-oncoplot per gene set, in the original order (ERBB set first, then VGSC)
for (gene.set in list(erbb.sig.path, VGSC.genes)) {
  maftools::coOncoplot(m1 = eac, m2 = escc, genes = gene.set, m1Name = "EAC", m2Name = "ESCC", keepGeneOrder = TRUE, removeNonMutated = FALSE)
}
Pfam domain enrichment
#Pfam domain summary for EAC (top = 10)
eac.pfam <- maftools::pfamDomains(
  maf = eac,
  top = 10
)
## Assuming protein change information are stored under column Protein_Change. Use argument AACol to override if necessary.
## Removed 425 mutations for which AA position was not available
#Pfam domain summary for ESCC (top = 10)
escc.pfam <- maftools::pfamDomains(
  maf = escc,
  top = 10
)
## Assuming protein change information are stored under column Protein_Change. Use argument AACol to override if necessary.
## Removed 350 mutations for which AA position was not available
#Stack the first ten domain-summary rows of each cohort; the list names end up in the "Disease" column
top10.by.disease <- list(
  ESCC = escc.pfam$domainSummary[1:10],
  EAC = eac.pfam$domainSummary[1:10]
)
escc.eac.top10.domains <- data.table::rbindlist(top10.by.disease, idcol = "Disease")
write.table(
  x = escc.eac.top10.domains,
  file = "Supplemental_Table_S5.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
Somatic interactions - Mutually Exclusive and Co-occurring gene sets
#Somatic interaction analysis on LAML (timed)
system.time(
  expr = laml.si <- maftools::somaticInteractions(
    maf = laml,
    pvalue = c(0.05, 0.1),
    findPathways = TRUE,
    returnAll = TRUE
  )
)
#Export the pairwise results and the gene sets as separate tables
write.table(
  x = laml.si$pairs,
  file = "Supplemental_Table_S6.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE
)
write.table(
  x = laml.si$gene_sets,
  file = "Supplemental_Table_S7.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE
)
#Oncoplot restricted to three genes, without row and column bar plots
maftools::oncoplot(
  maf = laml,
  genes = c("NPM1", "RUNX1", "TP53"),
  drawRowBar = FALSE,
  drawColBar = FALSE
)
Oncodrive to identify cancer genes
#Run oncodrive on LAML (timed)
system.time(
  expr = laml.od <- maftools::oncodrive(
    maf = laml
  )
)
## Estimating background scores from synonymous variants..
## Assuming protein change information are stored under column Protein_Change. Use argument AACol to override if necessary.
## Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)
## Estimating cluster scores from non-syn variants..
## Assuming protein change information are stored under column Protein_Change. Use argument AACol to override if necessary.
##
|
| | 0%
|
|=== | 4%
|
|====== | 9%
|
|======== | 13%
|
|=========== | 17%
|
|============== | 22%
|
|================= | 26%
|
|==================== | 30%
|
|======================= | 35%
|
|========================= | 39%
|
|============================ | 43%
|
|=============================== | 48%
|
|================================== | 52%
|
|===================================== | 57%
|
|======================================== | 61%
|
|========================================== | 65%
|
|============================================= | 70%
|
|================================================ | 74%
|
|=================================================== | 78%
|
|====================================================== | 83%
|
|========================================================= | 87%
|
|=========================================================== | 91%
|
|============================================================== | 96%
|
|=================================================================| 100%
## Comapring with background model and estimating p-values..
## Done !
## user system elapsed
## 0.628 0.064 0.698
#Plot the oncodrive results using mutation fractions and an FDR cutoff of 0.1
maftools::plotOncodrive(
  res = laml.od,
  useFraction = TRUE,
  fdrCutOff = 0.1
)
Pan-Cancer comparison
#Compare ESCA MutSigCV results against the pan-cancer cohort (q-value cutoff 0.1)
esca.genes.to.label <- c('ARID1A', 'TP53', 'MLL2', 'NFE2L2', 'ERBB2', 'CDKN2A', 'SAMD4', 'ZNF750', 'TGFBR2')
maftools::pancanComparison(
  mutsigResults = "ESCA_MutSigCV/ESCA-TP.sig_genes.txt.gz",
  qval = 0.1,
  cohortName = "ESCA",
  normSampleSize = TRUE,
  inputSampleSize = 185,
  genesToLabel = esca.genes.to.label,
  labelSize = 3
)
## Significantly mutated genes in ESCA (q < 0.1): 16
## Significantly mutated genes in PanCan cohort (q <0.1): 114
## Significantly mutated genes exclusive to ESCA (q < 0.1):
## gene pancan q nMut
## 1: CCDC7 1.000 9.964024e-02 10
## 2: COL6A5 1.000 2.448691e-03 18
## 3: DCDC1 1.000 4.614189e-05 16
## 4: FLG 1.000 1.436423e-02 32
## 5: IVL 1.000 1.229658e-05 15
## 6: TGFBR2 0.329 6.424124e-05 15
## 7: ZNF750 0.549 1.436423e-02 10
#Fix missing information in clinical data
cd <- maftools::getClinicalData(laml)
#Column 37 is renamed by position (presumably the FAB morphology column - verify against the clinical file)
colnames(cd)[37] <- 'FAB_Morphology'
#Treat 'Not_Classified' as missing
fab.class <- as.character(cd$FAB_Morphology)
cd$FAB_Morphology <- ifelse(test = fab.class == 'Not_Classified', yes = NA, no = fab.class)
#Classify samples as above_60 (age > 60) or below_60 (age <= 60)
age.at.diagnosis <- as.numeric(as.character(cd$age_at_initial_pathologic_diagnosis))
cd$Age_group <- ifelse(test = age.at.diagnosis > 60, yes = "above_60", no = "below_60")
#Run enrichment analysis on the age groups
laml.Age_group.ce <- maftools::clinicalEnrichment(
  maf = laml,
  clinicalFeature = 'Age_group',
  annotationDat = cd
)
## Sample size per factor in Age_group:
##
## above_60 below_60
## 81 111
#Run enrichment analysis on the FAB morphology classes
laml.fab.ce <- maftools::clinicalEnrichment(
  maf = laml,
  clinicalFeature = 'FAB_Morphology',
  annotationDat = cd
)
## Sample size per factor in FAB_Morphology:
##
## M0 M1 M2 M3 M4 M5 M6 M7
## 18 43 44 21 39 19 3 3
#Plot enrichment results, FAB morphology first and then age group
for (ce.res in list(laml.fab.ce, laml.Age_group.ce)) {
  maftools::plotEnrichmentResults(enrich_res = ce.res)
}
#Cohort-level summary of ESCA
maftools::plotmafSummary(maf = esca)
#Genes with MutSigCV q < 0.01, read from the gzipped result file via zcat
esca.mutsig <- data.table::fread(input = 'zcat < ESCA_MutSigCV/ESCA-TP.sig_genes.txt.gz')
ms.sig.genes <- esca.mutsig[q < 0.01, gene]
#VAF plot for those genes, then the comparison of ESCC against TCGA cohorts (x is reused for both results)
x <- maftools::plotVaf(maf = esca, genes = ms.sig.genes)
x <- maftools::tcgaCompare(maf = escc, cohortName = "ESCC")
#Genes that are frequently mutated in exomes regardless of disease; they are left out of the gene cloud
flags <- c(
  "TTN", "MUC16", "OBSCN", "AHNAK2", "SYNE1", "FLG", "MUC5B",
  "DNAH17", "PLEC", "DST", "SYNE2", "NEB", "HSPG2", "LAMA5",
  "AHNAK", "HMCN1", "USH2A", "DNAH11", "MACF1", "MUC17", "HYDIN"
)
maftools::geneCloud(
  input = esca,
  minMut = 15,
  genesToIgnore = flags
)
Read CNV data from GISTIC results
#Load the four GISTIC output files for ESCA
gistic.esca <- maftools::readGistic(
  gisticAllLesionsFile = "ESCA_GISTIC/all_lesions.conf_99.txt",
  gisticAmpGenesFile = "ESCA_GISTIC/amp_genes.conf_99.txt",
  gisticDelGenesFile = "ESCA_GISTIC/del_genes.conf_99.txt",
  gisticScoresFile = "ESCA_GISTIC/scores.gistic",
  isTCGA = TRUE
)
## Processing Gistic files..
## Processing amp_genes.conf_99.txt..
## Processing del_genes.conf_99.txt..
## Processing scores.gistic..
## Summarizing samples..
#Sample-to-histology table used to annotate the GISTIC oncoplot
esca.clin.full <- maftools::getClinicalData(x = esca)
esca.histology <- esca.clin.full[, .(Tumor_Sample_Barcode, Histology)]
#pdf(file = "ESCA_gistic_op.pdf", width = 9, height = 10, paper = "special", bg = "white")
maftools::gisticOncoPlot(
  gistic = gistic.esca,
  top = 25,
  fontSize = 10,
  clinicalData = esca.histology,
  clinicalFeatures = 'Histology',
  sortByAnnotation = TRUE,
  annotationColor = hist.col
)
#dev.off()
#Bubble plot marking two cytobands; the y-axis is then limited to [-20, 20] for display
gistic.esca.bp <- maftools::gisticBubblePlot(
  gistic = gistic.esca,
  markBands = c('9p21.3', '11q13.3'),
  fdrCutOff = 0.1
)
gistic.esca.bp + ylim(-20, 20)
## Warning: Removed 21 rows containing missing values (geom_point).
#cowplot::save_plot(filename = "ESCA_gistic_bp.pdf", plot = gistic.esca.bp, base_height = 5, base_width = 6, bg = "white")
#Chromosome-level GISTIC plot with the same two cytobands marked
maftools::gisticChromPlot(
  gistic = gistic.esca,
  fdrCutOff = 0.1,
  markBands = c('9p21.3', '11q13.3'),
  cytobandOffset = 0.1
)
## Warning: Ignoring unknown aesthetics: label
#Weights of known COSMIC signatures, shipped with maftools
cosmic.file <- system.file('extdata', 'signatures.txt', package = 'maftools')
sigs <- data.table::fread(input = cosmic.file, stringsAsFactors = FALSE, data.table = FALSE)
#Make column names syntactic by replacing spaces with underscores
colnames(sigs) <- gsub(pattern = ' ', replacement = '_', x = colnames(sigs))
#Index rows by mutation type, then drop the first three columns so only signature weights remain
rownames(sigs) <- sigs$Somatic_Mutation_Type
sigs <- sigs[, -(1:3)]
#Signature weights for ESCC
escc.sig.wts <- escc.sig$signatures
#Order COSMIC signatures in the same row order as the ESCC signatures
sigs <- sigs[rownames(escc.sig.wts), ]
set.seed(seed = 1024) #Set seed for reproducibility
#Loop-invariant: COSMIC reference weights as a matrix (apply() would otherwise coerce the data.frame on every call)
cosmic.mat <- as.matrix(sigs)
#Permutation test: in each of 10000 trials the rows of the ESCC signature matrix are shuffled and,
#for every shuffled signature, the best cosine similarity against all COSMIC signatures is recorded.
permutated_sigs_escc <- lapply(seq_len(10000), function(trial) {
  #Shuffle the 96 rows; drop = FALSE keeps a matrix even if only one signature was extracted
  w <- escc.sig.wts[sample(x = rownames(escc.sig.wts), size = 96, replace = FALSE), , drop = FALSE]
  vapply(seq_len(ncol(w)), function(j) {
    sig <- w[, j]
    #Estimate cosine similarity against all 30 signatures
    cosines <- apply(cosmic.mat, 2, function(x) {
      round(crossprod(sig, x) / sqrt(crossprod(x) * crossprod(sig)), digits = 3)
    })
    #Choose the best match
    max(cosines, na.rm = TRUE)
  }, FUN.VALUE = numeric(1))
})
#Flatten to one vector of best-match similarities (one value per signature per trial)
permutated_sigs_escc <- unlist(permutated_sigs_escc)
#Repeat the permutation test for EAC: take its signature weights ...
eac.sig.wts <- eac.sig$signatures
#... and put the COSMIC rows in the same order as the EAC signature rows
sigs <- sigs[rownames(eac.sig.wts), ]
#Print the dimensions of the EAC signature matrix
dim(eac.sig.wts)
## [1] 96 5
set.seed(seed = 1024) #Set seed for reproducibility
#Loop-invariant: COSMIC reference weights as a matrix (apply() would otherwise coerce the data.frame on every call)
cosmic.mat <- as.matrix(sigs)
#Permutation test: in each of 10000 trials the rows of the EAC signature matrix are shuffled and,
#for every shuffled signature, the best cosine similarity against all COSMIC signatures is recorded.
permutated_sigs_eac <- lapply(seq_len(10000), function(trial) {
  #Shuffle the 96 rows; drop = FALSE keeps a matrix even if only one signature was extracted
  w <- eac.sig.wts[sample(x = rownames(eac.sig.wts), size = 96, replace = FALSE), , drop = FALSE]
  vapply(seq_len(ncol(w)), function(j) {
    sig <- w[, j]
    #Estimate cosine similarity against all 30 signatures
    cosines <- apply(cosmic.mat, 2, function(x) {
      round(crossprod(sig, x) / sqrt(crossprod(x) * crossprod(sig)), digits = 3)
    })
    #Choose the best match
    max(cosines, na.rm = TRUE)
  }, FUN.VALUE = numeric(1))
})
#Flatten to one vector of best-match similarities (one value per signature per trial)
permutated_sigs_eac <- unlist(x = permutated_sigs_eac)
#Histogram of permuted best-match cosine similarities for ESCC
n.above.escc <- length(permutated_sigs_escc[permutated_sigs_escc > 0.8])
hist(
  permutated_sigs_escc,
  breaks = 25, xlim = c(0, 1), axes = FALSE,
  xlab = NA, ylab = NA, main = NA
)
axis(side = 1, at = seq(0, 1, 0.2), lty = 1, lwd = 2, font = 2)
mtext(text = "Cosine-similarity", side = 1, font = 2, line = 2)
axis(side = 2, at = seq(0, 4e3, 500), lty = 1, lwd = 2, las = 2, font = 2)
mtext(text = "# trials", side = 2, font = 2, line = 3)
title(main = "ESCC", adj = 0)
#Dashed line at 0.8 and the number of permuted values above it
abline(v = 0.8, col = "red", lty = 2)
text(x = 0.9, y = 500, labels = paste0("N: ", n.above.escc), font = 2)
#Same figure for EAC, with finer bins
n.above.eac <- length(permutated_sigs_eac[permutated_sigs_eac > 0.8])
hist(
  permutated_sigs_eac,
  breaks = 50, xlim = c(0, 1), axes = FALSE,
  xlab = NA, ylab = NA, main = NA
)
axis(side = 1, at = seq(0, 1, 0.2), lty = 1, lwd = 2, font = 2)
mtext(text = "Cosine-similarity", side = 1, font = 2, line = 2)
axis(side = 2, at = seq(0, 4e3, 500), lty = 1, lwd = 2, las = 2, font = 2)
mtext(text = "# trials", side = 2, font = 2, line = 3)
abline(v = 0.8, col = "red", lty = 2)
text(x = 0.9, y = 500, labels = paste0("N: ", n.above.eac), font = 2)
title(main = "EAC", adj = 0)
#Re-run the ESCC signature enrichment (timed); its assignments are used in the APOBEC comparison below
system.time(
  expr = escc.sig.enrichment <- maftools::signatureEnrichment(
    maf = escc,
    sig_res = escc.sig
  )
)
## Running k-means for signature assignment..
## Performing pairwise and groupwise comparisions..
## Sample size per factor in Signature:
##
## Signature_1 Signature_2 Signature_3
## 18 56 22
## Estimating mutation load and signature exposures..
## user system elapsed
## 5.976 0.000 5.976
#Re-run the EAC signature enrichment (timed)
system.time(
  expr = eac.sig.enrichment <- maftools::signatureEnrichment(
    maf = eac,
    sig_res = eac.sig
  )
)
## Running k-means for signature assignment..
## Performing pairwise and groupwise comparisions..
## Sample size per factor in Signature:
##
## Signature_1 Signature_2 Signature_3 Signature_4 Signature_5
## 9 15 18 16 31
## Estimating mutation load and signature exposures..
## user system elapsed
## 21.800 0.072 21.924
Correlation between samples with APOBEC enrichment score > 2, and samples classified as APOBEC signature associated.
#Samples flagged as APOBEC enriched by trinucleotideMatrix
escc.apobec.enriched.samples <- as.character(escc.tnm$APOBEC_scores[APOBEC_Enriched %in% 'yes', Tumor_Sample_Barcode])
#Samples assigned to Signature_1 by signatureEnrichment
escc.apobec.sig.samples <- escc.sig.enrichment$Signature_Assignment[Signature %in% 'Signature_1', Tumor_Sample_Barcode]
#Overlap between the two sample sets
apobec.olaps <- Vennerable::Venn(list(APOBEC_enrichment = escc.apobec.enriched.samples,
                                      APOBEC_signature = escc.apobec.sig.samples))
#Per-sample contribution of the first signature; row names (sample IDs) become column "rn"
escc.apobec.sig.contrib <- data.frame(APOBEC_score = escc.sig$contributions[1,])
data.table::setDT(x = escc.apobec.sig.contrib, keep.rownames = TRUE)
escc.apobec.scores <- escc.tnm$APOBEC_scores
#Join enrichment scores and signature contributions per sample
apobec.sig.comp <- merge(escc.apobec.scores, escc.apobec.sig.contrib, by.x = "Tumor_Sample_Barcode", by.y = "rn")[,.(Tumor_Sample_Barcode, APOBEC_Enrichment, APOBEC_score)]
#Point color: maroon for enrichment score >= 2, gray otherwise
apobec.sig.comp$enriched <- ifelse(test = apobec.sig.comp$APOBEC_Enrichment >= 2, yes = "maroon", no = "gray70")
#Point shape: cross (4) for samples in Venn set `10` (in the first list only), filled circle (19) otherwise
apobec.sig.comp$shape <- ifelse(test = apobec.sig.comp$Tumor_Sample_Barcode %in% apobec.olaps@IntersectionSets$`10`, yes = 4, no = 19)
#Spearman correlation between signature contribution and enrichment score
apobec.sig.cor <- cor.test(x = apobec.sig.comp$APOBEC_score, apobec.sig.comp$APOBEC_Enrichment, method = "spearman")
par(mar = c(4, 4, 2, 2))
plot(apobec.sig.comp$APOBEC_Enrichment, apobec.sig.comp$APOBEC_score, pch = apobec.sig.comp$shape, col = apobec.sig.comp$enriched, axes = FALSE, xlab = NA, ylab = NA, xlim = c(0, 4), ylim = c(0, 1.3))
axis(side = 2, at = seq(0, 1, 0.25), lwd = 2, las = 2, font = 2)
axis(side = 1, at = seq(0, 4, 1), lwd = 2, font = 2)
#Linear fit and a dashed vertical line at the enrichment cutoff of 2
abline(lm(apobec.sig.comp$APOBEC_score ~ apobec.sig.comp$APOBEC_Enrichment), col = "black", lwd = 2)
segments(x0 = 2, y0 = 0, x1 = 2, y1 = 1, lty = 2, lwd = 2)
text(x = 1, y = 0.9, adj = 1, labels = paste0("r: ", round(apobec.sig.cor$estimate, 3)), font = 2)
#Axis label spelling corrected ("score")
mtext(text = "APOBEC enrichment score", side = 1, line = 2.5, font = 2)
mtext(text = "Signature exposure", side = 2, line = 3, font = 2)
legend(x = 0.5, y = 1.3, legend = "trinucleotideMatrix", bty = "n", col = "maroon", pch = 19, text.font = 4)
legend(x = 0.5, y = 1.2, legend = "missed by signatureEnrichment", bty = "n", col = "maroon", pch = 4, cex = 1, text.font = 4)
maftools::plotEnrichmentResults(enrich_res = escc.sig.enrichment, pVal = 0.01)
Lollipop plots for VGSC associated genes
#Lollipop plot for SCN5A in ESCA; the plot object is kept for the combined figure
scn5a.esca <- maftools::lollipopPlot(
  maf = esca,
  gene = "SCN5A",
  defaultYaxis = FALSE
)
## HGNC refseq.ID protein.ID aa.length
## 1: SCN5A NM_000335 NP_000326 2015
## 2: SCN5A NM_198056 NP_932173 2016
## 3: SCN5A NM_001160161 NP_001153633 1962
## 4: SCN5A NM_001160160 NP_001153632 1983
## 5: SCN5A NM_001099405 NP_001092875 1998
## 6: SCN5A NM_001099404 NP_001092874 2016
#Lollipop plot for SCN3A in ESCA; the plot object is kept for the combined figure
scn3a.esca <- maftools::lollipopPlot(
  maf = esca,
  gene = "SCN3A",
  defaultYaxis = FALSE
)
## HGNC refseq.ID protein.ID aa.length
## 1: SCN3A NM_001081676 NP_001075145 1951
## 2: SCN3A NM_006922 NP_008853 2000
## 3: SCN3A NM_001081677 NP_001075146 1951
#Lollipop plot for SCN9A in ESCA
scn9a.esca <- maftools::lollipopPlot(
  maf = esca,
  gene = "SCN9A",
  defaultYaxis = FALSE
)
#Stack the three lollipop plots in a single column and draw the grid
VGSC.genes.lp <- cowplot::plot_grid(
  scn5a.esca, scn3a.esca, scn9a.esca,
  nrow = 3,
  ncol = 1
)
print(VGSC.genes.lp)
Compare MutSigCV and Oncodrive results
#MutSigCV results for LAML, read from the gzipped file via zcat
ms.res <- data.table::fread(input = "zcat < LAML_MuSigCV/LAML-TB.sig_genes.txt.gz")
#Genes significant (cutoff 0.1) in either OncodriveCLUST or MutSigCV
sig.genes <- unique(c(as.character(laml.od[fdr < 0.1, Hugo_Symbol]), ms.res[q < 0.1, gene]))
#Merged columns are, in order: gene, q (from MutSigCV), fdr (from OncodriveCLUST).
#all.x = TRUE keeps MutSigCV rows that have no OncodriveCLUST entry (fdr becomes NA).
sig.genes <- merge(ms.res[gene %in% sig.genes, .(gene, q)], laml.od[Hugo_Symbol %in% sig.genes, .(Hugo_Symbol, fdr)], by.x = 'gene', by.y = 'Hugo_Symbol', all.x = TRUE)
#Label each column after the tool its values come from (column 2 = q, column 3 = fdr)
colnames(sig.genes)[2:3] <- c("MutSigCV", "OncodriveCLUST")
#Convert to a plain data.frame indexed by gene symbol
data.table::setDF(sig.genes)
rownames(sig.genes) <- sig.genes$gene
sig.genes <- sig.genes[, -1]
#Replace exact zeros with machine epsilon in both columns (presumably so a later log-transform stays finite)
for (tool in c("MutSigCV", "OncodriveCLUST")) {
  sig.genes[[tool]] <- ifelse(test = sig.genes[[tool]] == 0, yes = .Machine$double.eps, no = sig.genes[[tool]])
}
#Significant genes (cutoff 0.1) per method, collected as the two Venn sets
venn.ip <- list(
  MutSigCV = as.character(ms.res[q < 0.1, gene]),
  OncodriveCLUST = as.character(laml.od[fdr < 0.1, Hugo_Symbol])
)
ms.smgs <- venn.ip$MutSigCV
od.smgs <- venn.ip$OncodriveCLUST
#First two colors of the 3-color Spectral palette, used for both outline and fill
venn.cols <- RColorBrewer::brewer.pal(n = 3, name = "Spectral")[1:2]
#filename = NULL returns the diagram object instead of writing a file; it is drawn with grid
venn.plot <- VennDiagram::venn.diagram(
  x = venn.ip,
  filename = NULL,
  col = venn.cols,
  fill = venn.cols,
  alpha = 0.6,
  cex = 3,
  cat.cex = 0
)
grid::grid.draw(x = venn.plot)
Oncodrive specific genes
#Genes called by OncodriveCLUST only: one lollipop plot per gene, with all positions labelled
od.specifc <- c("KIT", "KRAS")
#seq_along() is safe for an empty vector; the call is namespace-qualified like the rest of the document
for (i in seq_along(od.specifc)) {
  x <- maftools::lollipopPlot(maf = laml, gene = od.specifc[i], labelPos = 'all', defaultYaxis = FALSE, cBioPortal = TRUE)
  print(x)
}
MutSigCV specific genes
#Genes called by MutSigCV only: one lollipop plot per gene
ms.specific <- c("CEBPA", "TET2", "TP53", "PHF6", "PTPN11", "STAG2", "RAD21")
#seq_along() is safe for an empty vector; the call is namespace-qualified like the rest of the document
for (i in seq_along(ms.specific)) {
  x <- maftools::lollipopPlot(maf = laml, gene = ms.specific[i], defaultYaxis = FALSE, cBioPortal = TRUE)
  print(x)
}
#Print the R version, platform and package versions used to produce this document
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=de_DE.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] doParallel_1.0.11 iterators_1.0.9 foreach_1.4.4
## [4] ggplot2_3.0.0 Vennerable_3.1.0.9000 VennDiagram_1.6.20
## [7] futile.logger_1.4.3 NMF_0.21.0 cluster_2.0.7-1
## [10] rngtools_1.2.4 pkgmaker_0.22 registry_0.5
## [13] maftools_1.6.05 Biobase_2.38.0 BiocGenerics_0.24.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 matrixStats_0.53.1
## [3] bit64_0.9-7 RColorBrewer_1.1-2
## [5] progress_1.1.2 httr_1.3.1
## [7] rprojroot_1.3-2 GenomeInfoDb_1.14.0
## [9] tools_3.4.4 backports_1.1.2
## [11] R6_2.2.2 DBI_1.0.0
## [13] lazyeval_0.2.1 colorspace_1.3-2
## [15] GetoptLong_0.1.6 withr_2.1.2
## [17] gridExtra_2.3 tidyselect_0.2.4
## [19] prettyunits_1.0.2 RMySQL_0.10.8
## [21] bit_1.1-12 compiler_3.4.4
## [23] graph_1.56.0 formatR_1.5
## [25] DelayedArray_0.4.1 labeling_0.3
## [27] slam_0.1-43 rtracklayer_1.38.3
## [29] scales_1.0.0 RBGL_1.54.0
## [31] stringr_1.3.0 digest_0.6.15
## [33] Rsamtools_1.30.0 rmarkdown_1.9
## [35] cometExactTest_0.1.5 XVector_0.18.0
## [37] pkgconfig_2.0.1 htmltools_0.3.6
## [39] changepoint_2.2.2 BSgenome_1.46.0
## [41] rlang_0.2.1 GlobalOptions_0.0.13
## [43] RSQLite_2.1.0 shape_1.4.4
## [45] bindr_0.1.1 zoo_1.8-1
## [47] mclust_5.4.1 BiocParallel_1.12.0
## [49] dplyr_0.7.5 VariantAnnotation_1.24.5
## [51] RCurl_1.95-4.10 magrittr_1.5
## [53] GenomeInfoDbData_1.0.0 wordcloud_2.5
## [55] Matrix_1.2-14 Rcpp_0.12.18
## [57] munsell_0.5.0 S4Vectors_0.16.0
## [59] stringi_1.2.2 yaml_2.1.19
## [61] SummarizedExperiment_1.8.1 zlibbioc_1.24.0
## [63] plyr_1.8.4 blob_1.1.1
## [65] ggrepel_0.8.0 lattice_0.20-35
## [67] cowplot_0.9.3 Biostrings_2.46.0
## [69] splines_3.4.4 GenomicFeatures_1.30.3
## [71] circlize_0.4.3 knitr_1.20
## [73] ComplexHeatmap_1.17.1 pillar_1.2.2
## [75] GenomicRanges_1.30.3 rjson_0.2.20
## [77] reshape2_1.4.3 codetools_0.2-15
## [79] biomaRt_2.34.2 stats4_3.4.4
## [81] futile.options_1.0.1 XML_3.98-1.11
## [83] glue_1.2.0 evaluate_0.10.1
## [85] lambda.r_1.2.2 data.table_1.11.4
## [87] gtable_0.2.0 purrr_0.2.4
## [89] assertthat_0.2.0 gridBase_0.4-7
## [91] xtable_1.8-2 survival_2.42-3
## [93] tibble_1.4.2 GenomicAlignments_1.14.2
## [95] AnnotationDbi_1.40.0 memoise_1.1.0
## [97] IRanges_2.12.0 bindrcpp_0.2.2