This document contains supplemental R code for reproducing results from Maftools: Efficient and comprehensive analysis of somatic variants in cancer

Functions from maftools are called with the namespace maftools::. See SessionInfo at the end of the document for complete package details.

Install maftools

#Install maftools from Bioconductor
#NOTE(review): biocLite is the legacy Bioconductor installer; newer Bioconductor releases
#use BiocManager::install("maftools") instead - confirm against your R/Bioconductor version.
source("https://bioconductor.org/biocLite.R")
biocLite("maftools")

#For specific version of maftools used in the manuscript (v1.6.05), install from frozen GitHub pub branch
# library(devtools)
# devtools::install_github(repo = "PoisonAlien/maftools", ref = "pub")

Load required libraries

library(maftools)
library(NMF)
library(VennDiagram)
library(Vennerable)
library(ggplot2)

Read MAF files

#ESCA: read the MAF together with its clinical annotation table (timed)
system.time(
  expr = esca <- maftools::read.maf(
    maf = "TCGA_ESCA.maf.gz",
    clinicalData = "TCGA_ESCA_clinical.tsv",
    verbose = FALSE
  )
)
## reading maf..
## NOTE: Non MAF specific values in Variant_Classification column:
## [1] "Start_Codon_Del" "Stop_Codon_Ins"
## Done !
##    user  system elapsed 
##   1.924   0.064   1.998
#Separate ESCA into EAC or ESCC cohorts based on histology
esca.clinical = maftools::getClinicalData(x = esca)

#Sample barcodes annotated as squamous cell carcinoma (ESCC) ...
esca.sq.tsbs = esca.clinical[
  primary_pathology_histological_type %in% "Esophagus_Squamous_Cell_Carcinoma",
  Tumor_Sample_Barcode
]
#... and every remaining sample (treated as EAC)
esca.ad.tsbs = esca.clinical[
  !primary_pathology_histological_type %in% "Esophagus_Squamous_Cell_Carcinoma",
  Tumor_Sample_Barcode
]

#One MAF object per cohort
escc = maftools::subsetMaf(maf = esca, tsb = esca.sq.tsbs, mafObj = TRUE)
eac = maftools::subsetMaf(maf = esca, tsb = esca.ad.tsbs, mafObj = TRUE)

#BRCA WGS MAF (no clinical data supplied; timed)
system.time(
  expr = brca <- maftools::read.maf(
    maf = "TCGA_BRCA_WGS.maf.gz",
    verbose = FALSE
  )
)
## reading maf..
## Done !
##    user  system elapsed 
##   2.396   0.104   2.508
#LAML MAF with its clinical annotation table (timed)
system.time(
  expr = laml <- maftools::read.maf(
    maf = "TCGA_LAML.maf.gz",
    clinicalData = "TCGA_LAML_clinical.tsv",
    verbose = FALSE
  )
)
## reading maf..
## Done !
##    user  system elapsed 
##   0.200   0.036   0.243

Figure 2

A.

#Annotation colours for the Histology clinical feature
hist.col = list(Histology = c('EAC' = '#FB8072', 'ESCC' = '#80B1D3'))
#pdf(file = "esca_oncoplot.pdf", width = 10, height = 7, paper = "special", bg = "white")
#Oncoplot of MutSigCV genes (q < 0.1), samples sorted by histology
maftools::oncoplot(
  maf = esca,
  clinicalFeatures = "Histology",
  mutsig = 'ESCA_MutSigCV/ESCA-TP.sig_genes.txt.gz',
  sortByAnnotation = TRUE,
  mutsigQval = 0.1,
  annotationColor = hist.col,
  fontSize = 10
)

#dev.off()

B.

#Summarise transitions/transversions without plotting, then draw the summary.
#plotTiTv is namespace-qualified like every other maftools call in this document.
esca.titv = maftools::titv(maf = esca, plot = FALSE)
maftools::plotTiTv(res = esca.titv)

## NULL

C.

#TP53 lollipop plot; label amino-acid positions 248, 273 and 175
maftools::lollipopPlot(
  maf = esca,
  gene = "TP53",
  labelPos = c(248, 273, 175)
)

D.

For the representative sample shown in Figure 2D

#Rainfall plot with change-point detection for a single BRCA sample
maftools::rainfallPlot(
  maf = brca,
  tsb = "TCGA-A8-A08B",
  detectChangePoints = TRUE
)
## Processing TCGA-A8-A08B..
## Change points detected at:
##    Chromosome Start_Position End_Position nMuts Avg_intermutation_dist
## 1:          8       98000822     98456466    33               14238.88
## 2:          8      124090377    124096810    22                 306.33
## 3:          8      136085710    140768420    16              312180.67
## 4:         17       28925504     34484595    25              231628.79
## 5:         17       38619251     40070193    13              120911.83
## 6:         23      108609210    110760883     6              430334.60
##       Size C>G C>T C>A T>A T>C T>G Filter
## 1:  455644  11  22  NA  NA  NA  NA      .
## 2:    6433   1  21  NA  NA  NA  NA   PASS
## 3: 4682710   2  10   4  NA  NA  NA      .
## 4: 5559091   7  12   2   1   2   1      .
## 5: 1450942   3   9   1  NA  NA  NA      .
## 6: 2151673   1   2   1  NA   1   1      .

Run for all samples and prepare Supplemental_Table_S1

#Get all sample IDs
brca.tsbs = as.character(maftools::getSampleSummary(x = brca)[, Tumor_Sample_Barcode])
#Apply rainfallPlot function on all samples. It is called only for its side effects
#(the plot and the per-sample change-point files gathered below), so the return
#values are discarded instead of being stored in a variable that is overwritten later.
invisible(lapply(brca.tsbs, function(tsb) {
  maftools::rainfallPlot(maf = brca, detectChangePoints = TRUE, tsb = tsb)
}))

#Gather all output files generated from above command
#NOTE(review): the pattern matches any "_changePoints" file in the working directory,
#including files left over from earlier runs - confirm the directory is clean.
brca.cps = list.files(pattern = "_changePoints")
#Read and merge into single table (fill = TRUE tolerates files with differing columns)
brca.kats = data.table::rbindlist(lapply(brca.cps, data.table::fread), fill = TRUE)

write.table(brca.kats, file = "Supplemental_Table_S1.tsv", sep = '\t', quote = FALSE, row.names = FALSE)

Figure 3

Signature analysis

hg19.fa = "/bigdisk/ref_db/hg19/fa/hg19.fa" #set path to hg19 reference genome

#For ESCC: extract adjacent bases and signatures
#(prefix = "chr" with add = TRUE adds the prefix to the MAF contig names)
system.time(
  expr = escc.tnm <- maftools::trinucleotideMatrix(
    maf = escc,
    ref_genome = hg19.fa,
    prefix = "chr",
    add = TRUE
  )
)
## reading /bigdisk/ref_db/hg19/fa/hg19.fa (this might take few minutes)..
## Warning in maftools::trinucleotideMatrix(maf = escc, ref_genome =
## hg19.fa, : Chromosome names in MAF must match chromosome names in reference
## fasta. Ignorinig 554 single nucleotide variants from missing chromosomes
## chr23, chr24
## Extracting 5' and 3' adjacent bases..
## Extracting +/- 20bp around mutated bases for background C>T estimation..
## Estimating APOBEC enrichment scores..
## Performing one-way Fisher's test for APOBEC enrichment..
## APOBEC related mutations are enriched in 26.042% of samples (APOBEC enrichment score > 2 ; 25 of 96 samples)
## Creating mutation matrix..
## matrix of dimension 96x96
##    user  system elapsed 
##  17.256   2.252  29.619
#Decompose the ESCC mutation matrix into signatures, trying ranks up to nTry
system.time(
  expr = escc.sig <- maftools::extractSignatures(
    mat = escc.tnm,
    parallel = "P3",
    nTry = 8
  )
)
## Estimating best rank..
##    method   seed rng metric rank sparseness.basis sparseness.coef      rss
## 1: brunet random   1     KL    2        0.5306021       0.5208087 39669.08
## 2: brunet random   6     KL    3        0.5680904       0.3887575 26765.15
## 3: brunet random   5     KL    4        0.5496541       0.4065412 22801.67
## 4: brunet random   3     KL    5        0.5526710       0.4254033 20616.83
## 5: brunet random   6     KL    6        0.5737730       0.4517907 18766.57
## 6: brunet random   4     KL    7        0.5798829       0.4761835 17579.53
## 7: brunet random   2     KL    8        0.5932377       0.4835888 16403.01
##         evar silhouette.coef silhouette.basis residuals niter   cpu
## 1: 0.7629815       1.0000000        1.0000000  5616.754   470 0.124
## 2: 0.8400811       0.7618033        0.8200777  4868.709   450 0.148
## 3: 0.8637625       0.6053517        0.6598350  4583.411   700 0.248
## 4: 0.8768166       0.5372801        0.5747045  4349.541   660 0.260
## 5: 0.8878717       0.5101924        0.5328021  4156.612  1530 0.564
## 6: 0.8949642       0.4382449        0.5179072  3992.185  1260 0.564
## 7: 0.9019938       0.3959969        0.4617969  3842.871  2000 1.004
##    cpu.all nrun cophenetic dispersion silhouette.consensus
## 1:   4.816   10  0.9995303  0.9851736            0.9955865
## 2:   5.316   10  0.9869938  0.9260590            0.9479500
## 3:   5.568   10  0.9865768  0.7890712            0.8956212
## 4:   7.124   10  0.9633915  0.8023177            0.7803073
## 5:   7.616   10  0.9618033  0.8063108            0.7481140
## 6:   8.148   10  0.9349485  0.7958073            0.6424952
## 7:   9.244   10  0.9314505  0.8155295            0.5845807
## Using 3 as a best-fit rank based on decreasing cophenetic correlation coefficient.
## Comparing against experimentally validated 30 signatures.. (See http://cancer.sanger.ac.uk/cosmic/signatures for details.)
## Found Signature_1 most similar to validated Signature_13. Aetiology: APOBEC Cytidine Deaminase (C>G) [cosine-similarity: 0.838]
## Found Signature_2 most similar to validated Signature_6. Aetiology: defective DNA mismatch repair [cosine-similarity: 0.929]
## Found Signature_3 most similar to validated Signature_4. Aetiology: exposure to tobacco (smoking) mutagens [cosine-similarity: 0.881]

##    user  system elapsed 
##  74.080   8.812  39.370
#For EAC: extract adjacent bases and signatures
system.time(
  eac.tnm <- maftools::trinucleotideMatrix(
    maf = eac,
    ref_genome = hg19.fa,
    prefix = "chr",
    add = TRUE
  )
)
## reading /bigdisk/ref_db/hg19/fa/hg19.fa (this might take few minutes)..
## Warning in maftools::trinucleotideMatrix(maf = eac, ref_genome = hg19.fa, :
## Chromosome names in MAF must match chromosome names in reference fasta.
## Ignorinig 840 single nucleotide variants from missing chromosomes chr23,
## chr24
## Extracting 5' and 3' adjacent bases..
## Extracting +/- 20bp around mutated bases for background C>T estimation..
## Estimating APOBEC enrichment scores..
## Performing one-way Fisher's test for APOBEC enrichment..
## APOBEC related mutations are enriched in 0% of samples (APOBEC enrichment score > 2 ; 0 of 89 samples)
## Creating mutation matrix..
## matrix of dimension 89x96
##    user  system elapsed 
##  10.636   1.192  11.899
#Decompose the EAC mutation matrix into signatures, trying ranks up to nTry
system.time(
  eac.sig <- maftools::extractSignatures(
    mat = eac.tnm,
    parallel = "P3",
    nTry = 8
  )
)
## Estimating best rank..
##    method   seed rng metric rank sparseness.basis sparseness.coef      rss
## 1: brunet random   4     KL    2        0.5009027       0.3026437 90879.21
## 2: brunet random   5     KL    3        0.5554445       0.3172066 36610.05
## 3: brunet random   5     KL    4        0.5701976       0.3405838 28935.22
## 4: brunet random   3     KL    5        0.5773712       0.4119355 25128.97
## 5: brunet random   1     KL    6        0.5948495       0.4140505 21480.11
## 6: brunet random   2     KL    7        0.5944406       0.4339903 20042.54
## 7: brunet random   5     KL    8        0.5829818       0.4561130 19561.32
##         evar silhouette.coef silhouette.basis residuals niter   cpu
## 1: 0.8452432       1.0000000        1.0000000  5943.593   770 0.196
## 2: 0.9376573       0.7506887        0.7959748  4864.049   640 0.176
## 3: 0.9507267       0.6018015        0.7691925  4335.320  1230 0.368
## 4: 0.9572083       0.5199186        0.6432341  4085.009  1440 0.484
## 5: 0.9634219       0.4715414        0.5706040  3890.788   800 0.312
## 6: 0.9658699       0.4173380        0.4980237  3756.538  1120 0.472
## 7: 0.9666893       0.3578377        0.4540708  3632.213  1520 1.136
##    cpu.all nrun cophenetic dispersion silhouette.consensus
## 1:   4.100   10  0.8900892  0.3489509            0.6171745
## 2:   4.208   10  0.9690168  0.8383537            0.8795046
## 3:   4.984   10  0.9795232  0.8794900            0.8935544
## 4:   5.668   10  0.9777927  0.8843479            0.8726386
## 5:   5.932   10  0.9353045  0.7715238            0.6979691
## 6:   6.872   10  0.9365607  0.7826537            0.6196948
## 7:   9.544   10  0.8824548  0.7445272            0.4442875
## Using 5 as a best-fit rank based on decreasing cophenetic correlation coefficient.
## Comparing against experimentally validated 30 signatures.. (See http://cancer.sanger.ac.uk/cosmic/signatures for details.)
## Found Signature_1 most similar to validated Signature_3. Aetiology: defects in DNA-DSB repair by HR [cosine-similarity: 0.792]
## Found Signature_2 most similar to validated Signature_17. Aetiology: Unknown [cosine-similarity: 0.979]
## Found Signature_3 most similar to validated Signature_6. Aetiology: defective DNA mismatch repair [cosine-similarity: 0.952]
## Found Signature_4 most similar to validated Signature_29. Aetiology: exposure to tobacco (chewing) mutagens [cosine-similarity: 0.85]
## Found Signature_5 most similar to validated Signature_1. Aetiology: spontaneous deamination of 5-methylcytosine [cosine-similarity: 0.932]

##    user  system elapsed 
##  69.804   9.308  35.319
#Write APOBEC enrichment results (ESCC cohort) as a tab-separated table
write.table(
  x = escc.tnm$APOBEC_scores,
  file = "Supplemental_Table_S2.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

A. and B.

Plot signatures

#Signatures extracted from the ESCC cohort
maftools::plotSignatures(
  nmfRes = escc.sig
)

#Signatures extracted from the EAC cohort
maftools::plotSignatures(
  nmfRes = eac.sig
)

C.

Plot differences between APOBEC enriched and non APOBEC enriched samples.

#Compare APOBEC enriched vs non-enriched ESCC samples; keep the returned result
escc.apobec.diff = maftools::plotApobecDiff(
  tnm = escc.tnm,
  maf = escc,
  title_size = 1.1
)

D. and E.

Perform signature enrichment analysis

#Signature enrichment for ESCC (timed)
system.time(
  expr = escc.sig.enrichment <- maftools::signatureEnrichment(
    maf = escc,
    sig_res = escc.sig
  )
)
## Running k-means for signature assignment..
## Performing pairwise and groupwise comparisions..
## Sample size per factor in Signature:
## 
## Signature_1 Signature_2 Signature_3 
##          18          56          22
## Estimating mutation load and signature exposures..
##    user  system elapsed 
##   5.968   0.004   6.011
#Signature enrichment for EAC (timed)
system.time(
  expr = eac.sig.enrichment <- maftools::signatureEnrichment(
    maf = eac,
    sig_res = eac.sig
  )
)
## Running k-means for signature assignment..
## Performing pairwise and groupwise comparisions..
## Sample size per factor in Signature:
## 
## Signature_1 Signature_2 Signature_3 Signature_4 Signature_5 
##           9          15          18          16          31
## Estimating mutation load and signature exposures..
##    user  system elapsed 
##  22.280   0.032  22.431
#Save the group-wise enrichment table and keep genes with p < 0.01
write.table(
  x = escc.sig.enrichment$groupwise_comparision,
  file = "Supplemental_Table_S3.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
escc.sig.genes = escc.sig.enrichment$groupwise_comparision[p_value < 0.01, Hugo_Symbol]
#Get all samples associated with Signature 2
escc.sig2.tsbs = escc.sig.enrichment$Signature_Assignment[
  Signature %in% 'Signature_2',
  Tumor_Sample_Barcode
]
#Rest of the samples
rest.tsbs = escc.sig.enrichment$Signature_Assignment[
  !Tumor_Sample_Barcode %in% escc.sig2.tsbs,
  Tumor_Sample_Barcode
]

#Generate MAF object for above samples
escc.sig2 = maftools::subsetMaf(maf = escc, tsb = escc.sig2.tsbs, mafObj = TRUE)
escc.res = maftools::subsetMaf(maf = escc, tsb = rest.tsbs, mafObj = TRUE)

#Draw side-by-side oncoplot of TGFBR2 and ZNF292 for the two sample groups
maftools::coOncoplot(
  m1 = escc.sig2,
  m2 = escc.res,
  genes = c("TGFBR2", "ZNF292"),
  m1Name = "Signture_6 like",
  m2Name = "Rest",
  removeNonMutated = FALSE,
  geneNamefont = 16
)

Note: Above plot has been cropped in the manuscript to show only TGFBR2 mutations.

#Lollipop plot of TGFBR2 mutations in ESCC, labelling every position
maftools::lollipopPlot(
  maf = escc,
  gene = "TGFBR2",
  labelPos = 'all',
  cBioPortal = TRUE
)

##      HGNC    refseq.ID   protein.ID aa.length
## 1: TGFBR2 NM_001024847 NP_001020018       592
## 2: TGFBR2    NM_003242    NP_003233       567

Figure 4

A.

Compare ESCC and EAC to identify differentially mutated genes

#Differentially mutated genes between ESCC and EAC (timed)
system.time(
  expr = escc.vs.eac <- maftools::mafCompare(
    m1 = escc,
    m2 = eac,
    m1Name = "ESCC",
    m2Name = "EAC",
    minMut = 5
  )
)
##    user  system elapsed 
##   0.692   0.000   0.699
#All significantly differentially mutated genes (P-Value < 0.01)
maftools::forestPlot(
  escc.vs.eac,
  pVal = 0.01,
  color = as.character(unlist(hist.col)),
  geneFontSize = 0.7
)

#Full comparison table
write.table(
  x = escc.vs.eac$results,
  file = "Supplemental_Table_S4.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

B.

#Gene sets shown side by side for the two cohorts
VGSC.genes = c('SCN5A', 'SCN3A', 'SCN9A')
erbb.sig.path = c('ERBB2', 'ERBB4', 'TP53')

#ERBB signalling genes, EAC vs ESCC
maftools::coOncoplot(
  m1 = eac,
  m2 = escc,
  genes = erbb.sig.path,
  m1Name = "EAC",
  m2Name = "ESCC",
  keepGeneOrder = TRUE,
  removeNonMutated = FALSE
)

#VGSC.genes, EAC vs ESCC
maftools::coOncoplot(
  m1 = eac,
  m2 = escc,
  genes = VGSC.genes,
  m1Name = "EAC",
  m2Name = "ESCC",
  keepGeneOrder = TRUE,
  removeNonMutated = FALSE
)

C. and D.

Pfam domain enrichment

#Pfam domain summary for EAC
eac.pfam = maftools::pfamDomains(
  maf = eac,
  top = 10
)
## Assuming protein change information are stored under column Protein_Change. Use argument AACol to override if necessary.
## Removed 425 mutations for which AA position was not available

#Pfam domain summary for ESCC
escc.pfam = maftools::pfamDomains(
  maf = escc,
  top = 10
)
## Assuming protein change information are stored under column Protein_Change. Use argument AACol to override if necessary.
## Removed 350 mutations for which AA position was not available

#Stack the first ten domainSummary rows of each cohort; "Disease" records the source cohort
escc.eac.top10.domains = data.table::rbindlist(
  list(
    ESCC = escc.pfam$domainSummary[1:10],
    EAC = eac.pfam$domainSummary[1:10]
  ),
  idcol = "Disease"
)

write.table(
  x = escc.eac.top10.domains,
  file = "Supplemental_Table_S5.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

Figure 5

A.

Somatic interactions - Mutually Exclusive and Co-occurring gene sets

#Pairwise somatic interactions and gene sets in LAML (timed)
system.time(
  expr = laml.si <- maftools::somaticInteractions(
    maf = laml,
    pvalue = c(0.05, 0.1),
    findPathways = TRUE,
    returnAll = TRUE
  )
)

#Gene-pair results
write.table(
  x = laml.si$pairs,
  file = "Supplemental_Table_S6.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

#Gene-set results
write.table(
  x = laml.si$gene_sets,
  file = "Supplemental_Table_S7.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

B.

#Oncoplot restricted to three genes, without the row and column bar plots
maftools::oncoplot(
  maf = laml,
  genes = c("NPM1", "RUNX1", "TP53"),
  drawRowBar = FALSE,
  drawColBar = FALSE
)

C.

Oncodrive to identify cancer genes

#Run oncodrive on LAML (timed)
system.time(
  expr = laml.od <- maftools::oncodrive(
    maf = laml
  )
)
## Estimating background scores from synonymous variants..
## Assuming protein change information are stored under column Protein_Change. Use argument AACol to override if necessary.
## Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)
## Estimating cluster scores from non-syn variants..
## Assuming protein change information are stored under column Protein_Change. Use argument AACol to override if necessary.
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |=============================================                    |  70%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |=================================================================| 100%
## Comapring with background model and estimating p-values..
## Done !
##    user  system elapsed 
##   0.628   0.064   0.698
#Plot oncodrive results with an FDR cut-off of 0.1
maftools::plotOncodrive(
  res = laml.od,
  useFraction = TRUE,
  fdrCutOff = 0.1
)

D.

Pan-Cancer comparison

#Compare ESCA MutSigCV results (q < 0.1) against the pan-cancer cohort
maftools::pancanComparison(
  mutsigResults = "ESCA_MutSigCV/ESCA-TP.sig_genes.txt.gz",
  qval = 0.1,
  cohortName = "ESCA",
  normSampleSize = TRUE,
  inputSampleSize = 185,
  genesToLabel = c(
    'ARID1A', 'TP53', 'MLL2', 'NFE2L2', 'ERBB2',
    'CDKN2A', 'SAMD4', 'ZNF750', 'TGFBR2'
  ),
  labelSize = 3
)
## Significantly mutated genes in ESCA (q < 0.1): 16
## Significantly mutated genes in PanCan cohort (q <0.1): 114
## Significantly mutated genes exclusive to ESCA (q < 0.1):
##      gene pancan            q nMut
## 1:  CCDC7  1.000 9.964024e-02   10
## 2: COL6A5  1.000 2.448691e-03   18
## 3:  DCDC1  1.000 4.614189e-05   16
## 4:    FLG  1.000 1.436423e-02   32
## 5:    IVL  1.000 1.229658e-05   15
## 6: TGFBR2  0.329 6.424124e-05   15
## 7: ZNF750  0.549 1.436423e-02   10

E. and F.

#Fix missing information in clinical data
cd = maftools::getClinicalData(laml)
#Rename column 37 to 'FAB_Morphology'.
#NOTE(review): the position is hard-coded; verify that column 37 is the FAB
#morphology field in the clinical table being used.
colnames(x = cd)[37] = 'FAB_Morphology'
#Recode 'Not_Classified' as NA; all other values are kept as character
cd$FAB_Morphology = ifelse(test = as.character(cd$FAB_Morphology) == 'Not_Classified', yes = NA, no = as.character(cd$FAB_Morphology))
#Classify samples as above_60 & below_60 (age strictly greater than 60 -> above_60)
cd$Age_group = ifelse(test = as.numeric(as.character(cd$age_at_initial_pathologic_diagnosis)) > 60, yes = "above_60", no = "below_60")

#Run enrichment analysis
#Enrichment by age group, using the amended clinical table
laml.Age_group.ce = maftools::clinicalEnrichment(
  maf = laml,
  clinicalFeature = 'Age_group',
  annotationDat = cd
)
## Sample size per factor in Age_group:
## 
## above_60 below_60 
##       81      111
#Enrichment by FAB morphology, using the amended clinical table
laml.fab.ce = maftools::clinicalEnrichment(
  maf = laml,
  clinicalFeature = 'FAB_Morphology',
  annotationDat = cd
)
## Sample size per factor in FAB_Morphology:
## 
## M0 M1 M2 M3 M4 M5 M6 M7 
## 18 43 44 21 39 19  3  3
#Plot results: FAB morphology first, then age group
maftools::plotEnrichmentResults(
  enrich_res = laml.fab.ce
)

maftools::plotEnrichmentResults(
  enrich_res = laml.Age_group.ce
)

Supplemental Figure S1

A

#Summary plot for the ESCA MAF
maftools::plotmafSummary(
  maf = esca
)

B

#Genes with q < 0.01 in the gzipped MutSigCV table (decompressed through zcat)
ms.sig.genes = data.table::fread(
  input = 'zcat < ESCA_MutSigCV/ESCA-TP.sig_genes.txt.gz'
)[q < 0.01, gene]
#Plot VAFs for those genes
x = maftools::plotVaf(
  maf = esca,
  genes = ms.sig.genes
)

C

#Compare the ESCC cohort against TCGA cohorts
x = maftools::tcgaCompare(
  maf = escc,
  cohortName = "ESCC"
)

D

#Genes that are frequently mutated in exome data; excluded from the gene cloud below.
flags = c(
  "TTN", "MUC16", "OBSCN", "AHNAK2", "SYNE1", "FLG", "MUC5B",
  "DNAH17", "PLEC", "DST", "SYNE2", "NEB", "HSPG2", "LAMA5", "AHNAK",
  "HMCN1", "USH2A", "DNAH11", "MACF1", "MUC17", "HYDIN"
)

maftools::geneCloud(
  input = esca,
  minMut = 15,
  genesToIgnore = flags
)

Supplemental Figure S2

Read CNV data from GISTIC results

#Load the four GISTIC output files for ESCA
gistic.esca = maftools::readGistic(
  gisticAllLesionsFile = "ESCA_GISTIC/all_lesions.conf_99.txt",
  gisticAmpGenesFile = "ESCA_GISTIC/amp_genes.conf_99.txt",
  gisticDelGenesFile = "ESCA_GISTIC/del_genes.conf_99.txt",
  gisticScoresFile = "ESCA_GISTIC/scores.gistic",
  isTCGA = TRUE
)
## Processing Gistic files..
## Processing amp_genes.conf_99.txt..
## Processing del_genes.conf_99.txt..
## Processing scores.gistic..
## Summarizing samples..

A.

#Sample barcode and Histology columns used to annotate the GISTIC oncoplot
esca.histology = maftools::getClinicalData(x = esca)[, .(Tumor_Sample_Barcode, Histology)]
#pdf(file = "ESCA_gistic_op.pdf", width = 9, height = 10, paper = "special", bg = "white")
maftools::gisticOncoPlot(
  gistic = gistic.esca,
  top = 25,
  fontSize = 10,
  clinicalData = esca.histology,
  clinicalFeatures = 'Histology',
  sortByAnnotation = TRUE,
  annotationColor = hist.col
)

#dev.off()

B.

#Bubble plot with two cytobands marked, then displayed with the y-axis limited to [-20, 20]
gistic.esca.bp = maftools::gisticBubblePlot(
  gistic = gistic.esca,
  markBands = c('9p21.3', '11q13.3'),
  fdrCutOff = 0.1
)
gistic.esca.bp + ylim(-20, 20)
## Warning: Removed 21 rows containing missing values (geom_point).

#cowplot::save_plot(filename = "ESCA_gistic_bp.pdf", plot = gistic.esca.bp, base_height = 5, base_width = 6, bg = "white")

C.

#Chromosome plot of GISTIC results with two cytobands marked
maftools::gisticChromPlot(
  gistic = gistic.esca,
  fdrCutOff = 0.1,
  markBands = c('9p21.3', '11q13.3'),
  cytobandOffset = 0.1
)
## Warning: Ignoring unknown aesthetics: label

Supplemental Figure S3

C and D

#Weights of known COSMIC signatures, read from the table shipped with maftools
sigs = data.table::fread(
  input = system.file('extdata', 'signatures.txt', package = 'maftools'),
  stringsAsFactors = FALSE,
  data.table = FALSE
)
#Replace spaces in the column names by underscores
colnames(sigs) = gsub(pattern = ' ', replacement = '_', x = colnames(sigs))
#Use the mutation type as row name, then drop the first three columns
rownames(sigs) = sigs$Somatic_Mutation_Type
sigs = sigs[, -c(1:3)]

#Signature weights for ESCC
escc.sig.wts = escc.sig$signatures
#Order COSMIC signatures in same order as above signatures
sigs = sigs[rownames(escc.sig.wts), ]

set.seed(seed = 1024) #Set seed for reproducibility
#Permutation test: in each of 10000 trials the rows of the ESCC signature matrix are
#shuffled and, for every shuffled signature, the highest cosine similarity against
#the columns of `sigs` is recorded.
permutated_sigs_escc = lapply(seq_len(10000), function(trial){
                      #Shuffle the rows; similarities below are computed position-wise
                      w = escc.sig.wts[sample(x = rownames(escc.sig.wts), size = 96, replace = FALSE),]
                      #One best-match value per signature column. vapply avoids growing
                      #a matrix with rbind and re-using the trial index as loop variable.
                      vapply(seq_len(ncol(w)), function(j){
                          sig = w[,j]
                          cosines = apply(sigs, 2, function(x){
                            #Estimate cosine similarity against every column of sigs
                            round(crossprod(sig, x)/sqrt(crossprod(x) * crossprod(sig)), digits = 3)
                          })
                          #Choose the best match
                          max(cosines, na.rm = TRUE)
                      }, FUN.VALUE = numeric(1))
                      })

#Flatten into a single vector of best-match similarities
permutated_sigs_escc = unlist(permutated_sigs_escc)

#Do the same for EAC: take its signature weights ...
eac.sig.wts = eac.sig$signatures
#... and put the COSMIC table in the same row order
sigs = sigs[rownames(eac.sig.wts), ]

#Print the dimensions of the EAC weight matrix
dim(eac.sig.wts)
## [1] 96  5
set.seed(seed = 1024) #Set seed for reproducibility
#Permutation test: in each of 10000 trials the rows of the EAC signature matrix are
#shuffled and, for every shuffled signature, the highest cosine similarity against
#the columns of `sigs` is recorded.
permutated_sigs_eac = lapply(seq_len(10000), function(trial){
                      #Shuffle the rows; similarities below are computed position-wise
                      w = eac.sig.wts[sample(x = rownames(eac.sig.wts), size = 96, replace = FALSE),]
                      #One best-match value per signature column. vapply avoids growing
                      #a matrix with rbind and re-using the trial index as loop variable.
                      vapply(seq_len(ncol(w)), function(j){
                          sig = w[,j]
                          cosines = apply(sigs, 2, function(x){
                            #Estimate cosine similarity against every column of sigs
                            round(crossprod(sig, x)/sqrt(crossprod(x) * crossprod(sig)), digits = 3)
                          })
                          #Choose the best match
                          max(cosines, na.rm = TRUE)
                      }, FUN.VALUE = numeric(1))
                      })


#Flatten into a single vector of best-match similarities
permutated_sigs_eac = unlist(x = permutated_sigs_eac)
#Plot histogram
# Histogram of the ESCC permutation results. Default axes and labels are
# suppressed so they can be drawn manually in bold below.
hist(
  permutated_sigs_escc,
  breaks = 25,
  xlim = c(0, 1),
  axes = FALSE,
  xlab = NA,
  ylab = NA,
  main = NA
)
# x-axis and its label
axis(side = 1, at = seq(0, 1, 0.2), lty = 1, lwd = 2, font = 2)
mtext(text = "Cosine-similarity", side = 1, font = 2, line = 2)
# y-axis and its label
axis(side = 2, at = seq(0, 4e3, 500), lty = 1, lwd = 2, las = 2, font = 2)
mtext(text = "# trials", side = 2, font = 2, line = 3)
title(main = "ESCC", adj = 0)
# Mark the 0.8 similarity cut-off and annotate how many values exceed it
abline(v = 0.8, col = "red", lty = 2)
text(
  x = 0.9,
  y = 500,
  labels = paste0("N: ", length(permutated_sigs_escc[permutated_sigs_escc > 0.8])),
  font = 2
)

# Histogram of the EAC permutation results. Default axes and labels are
# suppressed so they can be drawn manually in bold below.
hist(
  permutated_sigs_eac,
  breaks = 50,
  xlim = c(0, 1),
  axes = FALSE,
  xlab = NA,
  ylab = NA,
  main = NA
)
# x-axis and its label
axis(side = 1, at = seq(0, 1, 0.2), lty = 1, lwd = 2, font = 2)
mtext(text = "Cosine-similarity", side = 1, font = 2, line = 2)
# y-axis and its label
axis(side = 2, at = seq(0, 4e3, 500), lty = 1, lwd = 2, las = 2, font = 2)
mtext(text = "# trials", side = 2, font = 2, line = 3)
# Mark the 0.8 similarity cut-off and annotate how many values exceed it
abline(v = 0.8, col = "red", lty = 2)
text(
  x = 0.9,
  y = 500,
  labels = paste0("N: ", length(permutated_sigs_eac[permutated_sigs_eac > 0.8])),
  font = 2
)
title(main = "EAC", adj = 0)

Supplemental Figure S4

A.

# Run maftools::signatureEnrichment on the ESCC cohort with its extracted
# signatures; system.time() reports how long the call takes.
system.time(
  expr = escc.sig.enrichment <- maftools::signatureEnrichment(
    maf = escc,
    sig_res = escc.sig
  )
)
## Running k-means for signature assignment..
## Performing pairwise and groupwise comparisions..
## Sample size per factor in Signature:
## 
## Signature_1 Signature_2 Signature_3 
##          18          56          22
## Estimating mutation load and signature exposures..

##    user  system elapsed 
##   5.976   0.000   5.976

B.

# Run maftools::signatureEnrichment on the EAC cohort with its extracted
# signatures; system.time() reports how long the call takes.
system.time(
  expr = eac.sig.enrichment <- maftools::signatureEnrichment(
    maf = eac,
    sig_res = eac.sig
  )
)
## Running k-means for signature assignment..
## Performing pairwise and groupwise comparisions..
## Sample size per factor in Signature:
## 
## Signature_1 Signature_2 Signature_3 Signature_4 Signature_5 
##           9          15          18          16          31
## Estimating mutation load and signature exposures..

##    user  system elapsed 
##  21.800   0.072  21.924

C

Correlation between samples with an APOBEC enrichment score > 2 and samples classified as APOBEC-signature-associated.

# Compare APOBEC enrichment scores with the exposure to the first extracted
# ESCC signature, and plot one against the other.
# Fixes: axis label typo ("socre" -> "score") and `T` replaced by `TRUE`.

# Samples flagged as APOBEC enriched in the APOBEC_scores table
escc.apobec.enriched.samples <- as.character(escc.tnm$APOBEC_scores[APOBEC_Enriched %in% 'yes', Tumor_Sample_Barcode])

# Samples assigned to Signature_1 by signatureEnrichment
escc.apobec.sig.samples <- escc.sig.enrichment$Signature_Assignment[Signature %in% 'Signature_1', Tumor_Sample_Barcode]
apobec.olaps <- Vennerable::Venn(list(APOBEC_enrichment = escc.apobec.enriched.samples,
     APOBEC_signature = escc.apobec.sig.samples))

# First row of the contributions matrix, one value per sample; setDT moves
# the row names (sample barcodes) into a column named "rn"
escc.apobec.sig.contrib <- data.frame(APOBEC_score = escc.sig$contributions[1,])
data.table::setDT(x = escc.apobec.sig.contrib, keep.rownames = TRUE)

escc.apobec.scores <- escc.tnm$APOBEC_scores

# One row per sample: enrichment score next to signature contribution
apobec.sig.comp <- merge(escc.apobec.scores, escc.apobec.sig.contrib, by.x = "Tumor_Sample_Barcode", by.y = "rn")[,.(Tumor_Sample_Barcode, APOBEC_Enrichment, APOBEC_score)]

# Point colour: maroon when the enrichment score is at least 2
apobec.sig.comp$enriched <- ifelse(test = apobec.sig.comp$APOBEC_Enrichment >= 2, yes = "maroon", no = "gray70")
# Point shape: cross (4) for samples in the `10` intersection set, presumably
# those in APOBEC_enrichment but not in APOBEC_signature -- confirm against
# Vennerable's set naming; filled circle (19) otherwise
apobec.sig.comp$shape <- ifelse(test = apobec.sig.comp$Tumor_Sample_Barcode %in% apobec.olaps@IntersectionSets$`10`, yes = 4, no = 19)
apobec.sig.cor <- cor.test(x = apobec.sig.comp$APOBEC_score, apobec.sig.comp$APOBEC_Enrichment, method = "spearman")

par(mar = c(4, 4, 2, 2))
plot(apobec.sig.comp$APOBEC_Enrichment, apobec.sig.comp$APOBEC_score, pch = apobec.sig.comp$shape, col = apobec.sig.comp$enriched, axes = FALSE, xlab = NA, ylab = NA, xlim = c(0, 4), ylim = c(0, 1.3))
axis(side = 2, at = seq(0, 1, 0.25), lwd = 2, las = 2, font = 2)
axis(side = 1, at = seq(0, 4, 1), lwd = 2, font = 2)
# Linear fit of signature contribution on enrichment score
abline(lm(apobec.sig.comp$APOBEC_score ~ apobec.sig.comp$APOBEC_Enrichment), col = "black", lwd = 2)
# Dashed vertical line at enrichment score 2
segments(x0 = 2, y0 = 0, x1 = 2, y1 = 1, lty = 2, lwd = 2)
text(x = 1, y = 0.9, adj = 1, labels = paste0("r: ", round(apobec.sig.cor$estimate, 3)), font = 2)
mtext(text = "APOBEC enrichment score", side = 1, line = 2.5, font = 2)
mtext(text = "Signature exposure", side = 2, line = 3, font = 2)
legend(x = 0.5, y = 1.3, legend = "trinucleotideMatrix", bty = "n", col = "maroon", pch = 19, text.font = 4)
legend(x = 0.5, y = 1.2, legend = "missed by signatureEnrichment", bty = "n", col = "maroon", pch = 4, cex = 1, text.font =  4)

D

# Plot the ESCC signature-enrichment results, passing 0.01 as the pVal cut-off
maftools::plotEnrichmentResults(
  enrich_res = escc.sig.enrichment,
  pVal = 0.01
)

Supplemental Figure S5

Lollipop plots for VGSC associated genes

# One lollipop plot per gene on the ESCA cohort; the console output printed
# by each call is kept directly below it.
scn5a.esca <- maftools::lollipopPlot(
  maf = esca,
  gene = "SCN5A",
  defaultYaxis = FALSE
)
##     HGNC    refseq.ID   protein.ID aa.length
## 1: SCN5A    NM_000335    NP_000326      2015
## 2: SCN5A    NM_198056    NP_932173      2016
## 3: SCN5A NM_001160161 NP_001153633      1962
## 4: SCN5A NM_001160160 NP_001153632      1983
## 5: SCN5A NM_001099405 NP_001092875      1998
## 6: SCN5A NM_001099404 NP_001092874      2016
scn3a.esca <- maftools::lollipopPlot(
  maf = esca,
  gene = "SCN3A",
  defaultYaxis = FALSE
)
##     HGNC    refseq.ID   protein.ID aa.length
## 1: SCN3A NM_001081676 NP_001075145      1951
## 2: SCN3A    NM_006922    NP_008853      2000
## 3: SCN3A NM_001081677 NP_001075146      1951
scn9a.esca <- maftools::lollipopPlot(
  maf = esca,
  gene = "SCN9A",
  defaultYaxis = FALSE
)


# Stack the three plots in a single column and draw the combined figure
VGSC.genes.lp <- cowplot::plot_grid(
  scn5a.esca, scn3a.esca, scn9a.esca,
  nrow = 3, ncol = 1
)
print(VGSC.genes.lp)

Supplemental Figure S6

A

Compare MutSigCV and OncodriveCLUST results

# Build a per-gene table of MutSigCV q-values and OncodriveCLUST FDRs for
# every gene significant (< 0.1) in at least one of the two methods.
#
# Fix: after the merge the columns are `gene`, `q` (from ms.res, MutSigCV)
# and `fdr` (from laml.od, OncodriveCLUST). They were previously renamed to
# c("OncodriveCLUST", "MutSigCV"), i.e. with the two labels swapped.

# Read the MutSigCV results, decompressing through zcat
ms.res <- data.table::fread(input = "zcat < LAML_MuSigCV/LAML-TB.sig_genes.txt.gz")

# Union of significant genes from both methods
sig.genes <- unique(c(as.character(laml.od[fdr < 0.1, Hugo_Symbol]), ms.res[q < 0.1, gene]))

# Left join on the MutSigCV table: genes absent from laml.od get NA for fdr
sig.genes <- merge(
  ms.res[gene %in% sig.genes, .(gene, q)],
  laml.od[Hugo_Symbol %in% sig.genes, .(Hugo_Symbol, fdr)],
  by.x = 'gene', by.y = 'Hugo_Symbol', all.x = TRUE
)
colnames(sig.genes)[2:3] <- c("MutSigCV", "OncodriveCLUST")

# Convert to a plain data.frame keyed by gene symbol
data.table::setDF(sig.genes)
rownames(sig.genes) <- sig.genes$gene
sig.genes <- sig.genes[,-1]

# Replace exact zeros by the machine epsilon. The original code floored the
# q-value column (under the swapped name "OncodriveCLUST"); both columns are
# floored here so that the values floored before stay floored and the column
# the name referred to is covered as well. NA entries stay NA.
sig.genes$MutSigCV <- ifelse(test = sig.genes$MutSigCV == 0, yes = .Machine$double.eps, no = sig.genes$MutSigCV)
sig.genes$OncodriveCLUST <- ifelse(test = sig.genes$OncodriveCLUST == 0, yes = .Machine$double.eps, no = sig.genes$OncodriveCLUST)

# Significantly mutated genes (threshold 0.1) reported by each method
ms.smgs <- as.character(ms.res[q < 0.1, gene])
od.smgs <- as.character(laml.od[fdr < 0.1, Hugo_Symbol])

# Two-set Venn diagram input and the first two colours of the
# 3-colour "Spectral" palette
venn.ip <- list(
  MutSigCV = ms.smgs,
  OncodriveCLUST = od.smgs
)
venn.cols <- RColorBrewer::brewer.pal(n = 3, name = "Spectral")[1:2]

# filename = NULL: nothing is written to disk here; the returned object is
# drawn with grid below. cat.cex = 0 sets the category label size to zero.
venn.plot <- VennDiagram::venn.diagram(
  x = venn.ip,
  filename = NULL,
  col = venn.cols,
  fill = venn.cols,
  alpha = 0.6,
  cex = 3,
  cat.cex = 0
)

grid::grid.draw(x = venn.plot)

B

OncodriveCLUST-specific genes

# Lollipop plot, with all positions labelled, for each gene listed below.
# lollipopPlot is now called through the maftools:: namespace like every
# other maftools call in this document, and the loop uses seq_along()
# instead of 1:length().
od.specifc <- c("KIT", "KRAS")
for (i in seq_along(od.specifc)) {
  x <- maftools::lollipopPlot(maf = laml, gene = od.specifc[i], labelPos = 'all', defaultYaxis = FALSE, cBioPortal = TRUE)
  print(x)
}

C

MutSigCV-specific genes

# Lollipop plot for each gene listed below.
# lollipopPlot is now called through the maftools:: namespace like every
# other maftools call in this document, and the loop uses seq_along()
# instead of 1:length().
ms.specific <- c("CEBPA", "TET2", "TP53", "PHF6", "PTPN11", "STAG2", "RAD21")

for (i in seq_along(ms.specific)) {
  x <- maftools::lollipopPlot(maf = laml, gene = ms.specific[i], defaultYaxis = FALSE, cBioPortal = TRUE)
  print(x)
}

SessionInfo

# Print the R version, platform, locale and package versions of this session
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] doParallel_1.0.11     iterators_1.0.9       foreach_1.4.4        
##  [4] ggplot2_3.0.0         Vennerable_3.1.0.9000 VennDiagram_1.6.20   
##  [7] futile.logger_1.4.3   NMF_0.21.0            cluster_2.0.7-1      
## [10] rngtools_1.2.4        pkgmaker_0.22         registry_0.5         
## [13] maftools_1.6.05       Biobase_2.38.0        BiocGenerics_0.24.0  
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6               matrixStats_0.53.1        
##  [3] bit64_0.9-7                RColorBrewer_1.1-2        
##  [5] progress_1.1.2             httr_1.3.1                
##  [7] rprojroot_1.3-2            GenomeInfoDb_1.14.0       
##  [9] tools_3.4.4                backports_1.1.2           
## [11] R6_2.2.2                   DBI_1.0.0                 
## [13] lazyeval_0.2.1             colorspace_1.3-2          
## [15] GetoptLong_0.1.6           withr_2.1.2               
## [17] gridExtra_2.3              tidyselect_0.2.4          
## [19] prettyunits_1.0.2          RMySQL_0.10.8             
## [21] bit_1.1-12                 compiler_3.4.4            
## [23] graph_1.56.0               formatR_1.5               
## [25] DelayedArray_0.4.1         labeling_0.3              
## [27] slam_0.1-43                rtracklayer_1.38.3        
## [29] scales_1.0.0               RBGL_1.54.0               
## [31] stringr_1.3.0              digest_0.6.15             
## [33] Rsamtools_1.30.0           rmarkdown_1.9             
## [35] cometExactTest_0.1.5       XVector_0.18.0            
## [37] pkgconfig_2.0.1            htmltools_0.3.6           
## [39] changepoint_2.2.2          BSgenome_1.46.0           
## [41] rlang_0.2.1                GlobalOptions_0.0.13      
## [43] RSQLite_2.1.0              shape_1.4.4               
## [45] bindr_0.1.1                zoo_1.8-1                 
## [47] mclust_5.4.1               BiocParallel_1.12.0       
## [49] dplyr_0.7.5                VariantAnnotation_1.24.5  
## [51] RCurl_1.95-4.10            magrittr_1.5              
## [53] GenomeInfoDbData_1.0.0     wordcloud_2.5             
## [55] Matrix_1.2-14              Rcpp_0.12.18              
## [57] munsell_0.5.0              S4Vectors_0.16.0          
## [59] stringi_1.2.2              yaml_2.1.19               
## [61] SummarizedExperiment_1.8.1 zlibbioc_1.24.0           
## [63] plyr_1.8.4                 blob_1.1.1                
## [65] ggrepel_0.8.0              lattice_0.20-35           
## [67] cowplot_0.9.3              Biostrings_2.46.0         
## [69] splines_3.4.4              GenomicFeatures_1.30.3    
## [71] circlize_0.4.3             knitr_1.20                
## [73] ComplexHeatmap_1.17.1      pillar_1.2.2              
## [75] GenomicRanges_1.30.3       rjson_0.2.20              
## [77] reshape2_1.4.3             codetools_0.2-15          
## [79] biomaRt_2.34.2             stats4_3.4.4              
## [81] futile.options_1.0.1       XML_3.98-1.11             
## [83] glue_1.2.0                 evaluate_0.10.1           
## [85] lambda.r_1.2.2             data.table_1.11.4         
## [87] gtable_0.2.0               purrr_0.2.4               
## [89] assertthat_0.2.0           gridBase_0.4-7            
## [91] xtable_1.8-2               survival_2.42-3           
## [93] tibble_1.4.2               GenomicAlignments_1.14.2  
## [95] AnnotationDbi_1.40.0       memoise_1.1.0             
## [97] IRanges_2.12.0             bindrcpp_0.2.2