Supplemental Data 2 for Boot et al., “Characterization of colibactin-associated mutational signature in an Asian oral squamous cell carcinoma and in other mucosal tumor types”

This script applies the mSigAct signature presence test (Ng et al. 2017) to evaluate the hypothesis that the mutational signature identified in OSCC 62074759 (the A^nT signature) is present in candidate tumors. Candidate tumors were previously identified as tumors showing strong enrichment for mutations with the characteristics of A^nT, as described in Supplemental Data 1 of this manuscript.

Methods / code

We start with the following input data:

  • candidates contains the tumors identified using the script in Supplemental Data 1 (it is the ‘results’ object from that script)

  • “PCAWG7 attributions” is a folder containing 4 signature attribution files, one for each of the 4 major groups of tumors in the PCAWG7 analysis of mutational signatures (Alexandrov et al., 2018, downloaded from: https://www.synapse.org/#!Synapse:syn11804065 on June 24th 2018).

First, for each of the candidates, extract the PCAWG7 signature assignments, which will be used later in the analysis.

library(stringr)
## Read the table of candidate tumors (tab-separated).
candidates <- read.csv("candidatesW_AnT.txt", sep = "\t", as.is = TRUE)

## remove 62074759 (done by dropping the last row of the table)
last.row <- nrow(candidates)
candidates <- candidates[-last.row, ]

## the sample identifier is the part of each row name before the first "___"
row.name.parts <- str_split_fixed(rownames(candidates), "___", 2)
candidates$sample <- row.name.parts[, 1]

## rename the tumor types in the candidates table because of different
## annotations: every row name of tumorTypes is replaced by its `annotation`
tumorTypes <- read.csv("tumorTypes.txt", sep = "\t", as.is = TRUE,
                       row.names = 1)
for (old.name in rownames(tumorTypes)) {
  new.name <- tumorTypes$annotation[rownames(tumorTypes) == old.name]
  candidates$sample <- gsub(old.name, new.name, candidates$sample)
}

## specify which file we should look in for the attributions
attribution.source <- c(WES_Other = "nonPCAWG_WES",
                        WES_TCGA  = "TCGA_WES",
                        WGS_Other = "nonPCAWG_WGS",
                        WGS_ICGC  = "PCAWG_WGS")
for (data.type in names(attribution.source)) {
  candidates$attributions[candidates$dataType == data.type] <-
    attribution.source[[data.type]]
}

## gather the signature attributions for the candidate samples:
## from every attribution file keep the rows whose
## "<Cancer.Types>::<Sample.Names>" key occurs in candidates$sample
files <- list.files("PCAWG7 attributions", full.names = TRUE)
candidate.rows <- lapply(files, function(attribution.path) {
  per.file <- read.csv(attribution.path, as.is = TRUE)
  sample.key <- paste0(per.file$Cancer.Types, "::", per.file$Sample.Names)
  per.file[sample.key %in% candidates$sample, ]
})
## stack the per-file pieces one after another, starting from NULL
attributions <- Reduce(rbind, candidate.rows, NULL)

Load the scripts for the mSigAct analysis and prepare the signatures and catalogs

## Load the mSigTools and mSigAct scripts.
## Console output recorded when this was run: sourcing mSigTools printed
## "Loading required package: SnowballC"; sourcing mSigAct attached package
## 'sets', which masks `%>%` from 'package:stringr'.
source("src/mSigTools.v0.13.R")
source("src/mSigAct.v0.10.R")

## load signatures; the result carries an exome and a genome version
cosmic.sigs <- get.signatures(
  signature.file = "PCAWG7and62074759_96.tsv",
  exome.op = .h19.96.sureselect.v6.op
)
cosmic.wes <- cosmic.sigs$exome
cosmic.wgs <- cosmic.sigs$genome
rm(cosmic.sigs)

## Downstream mSigAct requires that the elements of each signature sum to
## exactly 1. Eventually move this code to mSigAct or change to an
## all.equal-style tolerance.
## The column rescaling is repeated several times, presumably because a single
## pass can leave rounding error behind.
for (pass in seq_len(3)) {
  cosmic.wes <- sweep(cosmic.wes, MARGIN = 2,
                      STATS = colSums(cosmic.wes), FUN = "/")
}
stopifnot(colSums(cosmic.wes) == 1)

for (pass in seq_len(3)) {
  cosmic.wgs <- sweep(cosmic.wgs, MARGIN = 2,
                      STATS = colSums(cosmic.wgs), FUN = "/")
}
stopifnot(colSums(cosmic.wgs) == 1)

## load the catalogs of the candidate tumors (tab-separated spectrum counts)
exomes <- read.csv("catalogs/spectrum_counts_exomes.txt",
                   sep = "\t", as.is = TRUE)
genomes <- read.csv("catalogs/spectrum_counts_genomes.txt",
                    sep = "\t", as.is = TRUE)

Define a function that runs mSigAct on one sample, using the signatures attributed to that sample in the PCAWG analysis plus AnT.

## Sort signature names by their numeric index.
##
## sigs: character vector of signature names, e.g. "SBS1" or "SBS17b".
## base: pattern removed from every name before the numeric conversion
##       (default "SBS"); it is handed to gsub() as a regular expression.
##
## The letters a, b, c and d are stripped as well, so lettered variants of one
## signature share a single sort key. Returns `sigs` reordered by that key.
sortSigs <- function(sigs, base = "SBS") {
  without.base <- gsub(base, "", sigs)
  digits.only <- gsub("[abcd]", "", without.base)
  sort.key <- as.numeric(digits.only)
  sigs[order(sort.key)]
}

## Run the mSigAct signature presence test for AnT on one sample.
##
## smp: a sample name as it appears in attributions$Sample.Names.
##
## Reads the globals `attributions`, `candidates`, `cosmic.wgs`, `cosmic.wes`,
## `genomes` and `exomes`, and calls process.one.group() and
## obj.fun.nbinom.maxlh (neither is defined in this file; presumably both come
## from the sourced mSigAct script -- confirm).
##
## Returns a one-row matrix whose row name is `smp`, with a "pval" column
## followed by the exposures reported by process.one.group().
## Stops with an error when the sample, its data type or its catalog column
## cannot be identified unambiguously.
run.mSigAct.per.smp <- function(smp) {
  smp.rows <- which(attributions$Sample.Names == smp)
  if (length(smp.rows) == 0) {
    stop("no attributions found for sample ", smp, call. = FALSE)
  }

  ## signatures with a non-zero attribution in the first matching row
  ## (columns 4 onwards are treated as the signature columns)
  tmp <- attributions[smp.rows, 4:ncol(attributions)]
  first.row <- unlist(tmp[1, ])
  sigs <- colnames(tmp)[which(first.row != 0)]

  ## SBS1 and SBS5 are always included in this analysis, regardless of the
  ## previous assignments. (The original note also lists SBS40 as present in
  ## all tumors, but only SBS1 and SBS5 are added here.)
  if (!"SBS1" %in% sigs) {
    sigs <- c(sigs, "SBS1")
  }
  if (!"SBS5" %in% sigs) {
    sigs <- c(sigs, "SBS5")
  }
  sigs <- sortSigs(sigs)

  ## depending on data type, use WGS or WES signatures.
  ## Look the candidate up by its exact "<Cancer.Types>::<Sample.Names>" key,
  ## so that a sample name that is a substring of another one cannot match
  ## several candidates. Fall back to the pattern match only if no exact key
  ## is found.
  smp.key <- paste0(attributions$Cancer.Types[smp.rows], "::", smp)
  in.candidates <- candidates$sample %in% smp.key
  if (!any(in.candidates)) {
    in.candidates <- grepl(smp, candidates$sample)
  }
  dataType <- candidates$dataType[in.candidates]
  dataType <- unique(str_split_fixed(dataType, "_", 2)[, 1])
  if (length(dataType) != 1 || is.na(dataType)) {
    stop("cannot determine a unique data type (WGS/WES) for sample ", smp,
         call. = FALSE)
  }

  if (dataType == "WGS") {
    universe <- cosmic.wgs
    catalog <- genomes
  } else {
    universe <- cosmic.wes
    catalog <- exomes
  }

  missing.sigs <- setdiff(c(sigs, "AnT"), colnames(universe))
  if (length(missing.sigs) > 0) {
    stop("signature(s) not available for sample ", smp, ": ",
         paste(missing.sigs, collapse = ", "), call. = FALSE)
  }
  subverse <- universe[, c(sigs, "AnT")]

  ## ensure that catalog and signatures are in the same order
  rownames(catalog) <- paste0(catalog$Before, catalog$Ref,
                              catalog$After, catalog$Var)
  subverse <- subverse[rownames(catalog), ]

  ## Locate the sample's column in the catalog. "-" is replaced by "." because
  ## the catalog column names carry "." where the sample name has "-".
  ## If the pattern hits several columns, prefer the one that equals the
  ## syntactic form of the sample name (make.names).
  smp.cols <- grep(gsub("-", ".", smp), colnames(catalog))
  if (length(smp.cols) > 1) {
    exact.col <- which(colnames(catalog) == make.names(smp))
    if (length(exact.col) == 1) {
      smp.cols <- exact.col
    }
  }
  if (length(smp.cols) != 1) {
    stop("expected exactly one catalog column for sample ", smp,
         ", found ", length(smp.cols), call. = FALSE)
  }
  input <- as.matrix(catalog[, smp.cols])
  colnames(input) <- smp
  rownames(input) <- rownames(subverse)

  analysis <- process.one.group(input,
                                subverse,
                                target.sig.name = "AnT",
                                path.root = paste0("mSigAct_output/", smp),
                                obj.fun = obj.fun.nbinom.maxlh,
                                nbinom.size = 10, ## = dispersion parameter
                                mc.cores = 1)     ## = number of cores

  # results: one row per sample, p-value first, then the exposures
  df <- t(rbind(pval = analysis$pval, analysis$exposure))
  rownames(df) <- smp
  df
}

Run mSigAct per sample and generate a summary results table

## make a df to save the attributions in; it starts as a copy of the PCAWG
## attributions and gets an AnT exposure column and a p-value column
mSigAct_result <- attributions
mSigAct_result$AnT <- NA
mSigAct_result$mSigAct_pval <- 0

for (smp in attributions$Sample.Names) {
  df <- as.data.frame(run.mSigAct.per.smp(smp))
  smp.rows <- mSigAct_result$Sample.Names == smp
  mSigAct_result$mSigAct_pval[smp.rows] <- df$pval

  ## Copy the exposures by signature name rather than by position, so the
  ## values stay aligned even if the column order of the result differs from
  ## the column order of the attributions table.
  sig.cols <- setdiff(colnames(df), "pval")
  stopifnot(all(sig.cols %in% colnames(mSigAct_result)))
  mSigAct_result[smp.rows, sig.cols] <- df[1, sig.cols, drop = FALSE]
}

## remove signatures that are not attributed to any sample:
## keep column 2 (the sample names) plus every column after the third whose
## column sum is positive
mSigAct_result <-
  mSigAct_result[c(2, which(colSums(mSigAct_result[, -c(1:3)]) > 0) + 3)]
## perform multiple testing correction
mSigAct_result$mSigAct_qval <-
  p.adjust(mSigAct_result$mSigAct_pval, method = "BH")

## format to print output table to html:
## exposure columns (everything between the sample name and the two
## p/q-value columns) are reformatted without decimals or scientific notation
for (i in 2:(ncol(mSigAct_result) - 2)) {
  mSigAct_result[, i] <- as.numeric(format(as.numeric(mSigAct_result[, i]),
                                           digits = 0, scientific = FALSE))
}
mSigAct_result$mSigAct_pval <- format(mSigAct_result$mSigAct_pval, digits = 3)
mSigAct_result$mSigAct_qval <- format(mSigAct_result$mSigAct_qval, digits = 3)
library(knitr)
## print everything except the raw p-value column (second to last); the
## alignment vector is sized from the table so every column gets an entry
out.table <- mSigAct_result[, -(ncol(mSigAct_result) - 1)]
kable(out.table, row.names = FALSE,
      align = c("l", rep("c", ncol(out.table) - 1)))
Sample.Names SBS1 SBS2 SBS5 SBS7a SBS7b SBS9 SBS10a SBS10b SBS13 SBS14 SBS15 SBS16 SBS17a SBS17b SBS18 SBS27 SBS28 SBS37 SBS40 SBS45 SBS57 AnT mSigAct_qval
BD121T 117 0 145 0 0 0 1119 833 0 0 0 0 0 0 0 0 0 0 0 0 0 70 3.27e-03
BD173T 0 0 0 0 0 0 0 0 0 0 0 377 0 0 0 0 0 0 0 0 0 846 6.48e-39
BD182T 0 0 349 0 0 0 0 0 0 0 0 0 0 0 0 0 879 0 0 0 0 877 1.77e-69
BD223T 0 0 160 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 80 182 2.40e-16
sysucc-311T 0 0 745 0 0 0 3913 7406 0 0 0 0 0 0 0 0 1864 0 0 0 0 231 8.01e-04
ESO-173 64 0 82 0 0 0 0 0 0 0 0 0 23 33 0 0 0 0 0 0 0 0 1.00e+00
HCC34T 0 0 249 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1.75e-01
PCSI_0060_Pa_X 33 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 32 47 6.68e-07
SKCM-JWCI-WGS-8-Tumor 0 0 125 394 187 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9.27e-01
T155 37 0 41 0 0 0 467 438 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3.25e-01
LP6005935-DNA_B03 899 0 11769 0 0 0 0 0 0 0 0 0 5759 16897 0 0 0 0 0 0 0 9577 3.63e-18
8069334 1153 0 3229 0 0 0 0 0 0 0 0 0 500 1357 0 0 0 0 0 0 0 862 3.01e-07
0047_CRUK_PC_0047_T1_DNA 496 0 1904 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 951 1.85e-16
SP22031 2490 0 4530 0 0 0 124275 80170 0 0 0 0 0 0 0 0 22766 0 0 0 0 0 4.54e-01
SP16886 0 0 12505 0 0 0 434277 251355 0 0 0 0 0 0 0 0 151145 0 0 0 0 0 7.13e-01
SP19295 4183 0 6759 0 0 0 127419 102190 0 0 0 0 0 0 0 0 17351 0 0 0 0 1965 7.91e-02
SP17905 0 0 30073 0 0 0 1215032 478155 0 0 0 0 0 0 0 0 523625 185733 0 0 0 0 8.33e-01
SP21400 9932 0 15411 0 0 0 256130 446142 0 0 0 0 0 0 0 0 65424 0 0 0 0 0 1.75e-01
SP18946 0 0 16982 0 0 0 490500 366755 0 0 0 0 0 0 0 0 86576 0 0 0 0 0 4.54e-01
SP80615 0 0 57839 0 0 0 1070130 795814 0 0 0 0 0 0 0 0 479938 0 0 0 0 22748 2.38e-02
SP81494 2341 0 12182 0 0 0 0 0 0 0 0 0 0 0 2613 0 0 0 2963 4521 0 2148 7.18e-04
SP81711 3789 0 4065 0 0 0 0 0 0 0 0 0 0 0 3537 0 0 0 4975 0 0 3908 1.08e-12
SP80754 2305 0 2948 0 0 0 0 0 0 0 0 0 0 0 2097 0 206 0 0 0 0 3653 6.48e-39
SP111026 1481 0 8490 0 0 0 0 0 0 0 0 0 9010 24066 0 0 0 0 0 0 0 3742 2.77e-10
SP111101 3450 0 3123 0 0 0 0 0 0 0 0 0 1011 1607 0 0 0 0 5476 0 0 2245 4.04e-07
SP92659 1802 0 6991 0 0 0 191629 55154 0 0 0 0 0 0 0 0 26686 0 0 0 0 0 4.54e-01
TCGA-AB-2824-03B-01W-0728-08 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 70 0 0 0 0 0 0 1.00e+00
TCGA-AB-2851-03B-01W-0728-08 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 58 0 0 0 0 0 0 1.00e+00
TCGA-AB-2867-03B-01W-0728-08 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 115 0 0 0 0 0 0 1.00e+00
TCGA-AB-2868-03B-01W-0728-08 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 201 0 0 0 0 0 3 9.49e-02
TCGA-FU-A3HZ-01A-11D-A20U-09 0 0 107 0 0 0 1418 1003 0 0 0 0 0 0 0 0 655 0 0 0 0 71 1.61e-03
TCGA-AY-4071-01A-01W-1073-09 45 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 79 0 0 40 7.69e-05
TCGA-AG-3892-01A-01W-1073-09 0 0 91 0 0 0 1932 1570 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1.00e+00
TCGA-AG-3902-01A-01W-1073-09 42 0 41 0 0 20 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 65 1.58e-07
TCGA-2H-A9GM-01A-11D-A37C-09 23 0 81 0 0 0 0 0 0 0 0 0 5 21 43 0 0 0 0 0 0 0 2.38e-01
TCGA-IG-A4QS-01A-11D-A27G-09 69 0 112 0 0 0 0 0 0 0 0 0 38 64 0 0 0 0 0 0 0 36 2.08e-02
TCGA-R6-A6L4-01A-11D-A31U-09 8 0 63 0 0 0 0 0 10 0 0 0 17 44 19 0 0 0 0 0 0 0 7.13e-01
TCGA-BA-A4IG-01A-11D-A25Y-08 0 16 81 0 0 0 0 0 27 0 0 0 0 0 0 0 0 0 0 0 0 44 1.51e-06
TCGA-BR-6453-01A-11D-1800-08 49 0 81 0 0 0 0 0 0 0 0 0 32 65 0 0 0 0 0 0 0 0 1.00e+00
TCGA-D7-A4Z0-01A-22D-A25D-08 37 0 62 0 0 0 0 0 0 0 0 0 97 169 0 0 0 0 0 0 0 0 6.07e-01
TCGA-G2-AA3B-01A-11D-A391-08 0 352 101 0 0 0 0 0 438 0 0 0 0 0 0 0 0 0 0 0 0 45 3.25e-04
TCGA-GC-A6I3-01A-11D-A31L-08 12 71 105 0 0 0 0 0 90 0 0 0 0 0 0 0 0 0 0 0 0 32 3.89e-03
TCGA-A5-A0GP-01A-11W-A062-09 76 0 202 0 0 0 950 804 0 0 0 0 0 0 0 0 195 0 0 0 0 0 3.25e-01
TCGA-AJ-A5DW-01A-11D-A27P-09 97 0 307 0 0 0 3352 1499 0 0 0 0 0 0 0 0 366 0 0 0 0 0 4.15e-01
TCGA-AP-A1E0-01A-11D-A135-09 216 0 325 0 0 0 4978 2070 0 0 0 0 0 0 0 0 810 0 0 0 0 0 9.48e-01
TCGA-AX-A1CE-01A-11D-A135-09 0 0 190 0 0 0 0 0 0 4221 15172 0 0 0 0 0 0 0 0 0 0 0 1.13e-01
TCGA-BK-A6W3-01A-12D-A34Q-09 123 0 335 0 0 0 4032 1867 0 0 0 0 0 0 0 0 722 0 0 0 0 90 4.80e-02
TCGA-DF-A2KV-01A-11D-A17W-09 38 0 74 0 0 0 1008 938 0 0 0 0 0 0 0 0 286 0 0 0 0 47 2.34e-02
TCGA-E6-A1M0-01A-11D-A142-09 56 0 126 0 0 0 1913 778 0 0 0 0 0 0 0 0 296 0 0 0 0 0 4.55e-01
TCGA-EO-A3AV-01A-12D-A19Y-09 0 0 398 0 0 0 4037 1899 0 0 0 0 0 0 0 0 990 0 0 0 0 0 1.94e-01
TCGA-EO-A3AY-01A-12D-A19Y-09 0 0 333 0 0 0 3805 1489 0 0 0 0 0 0 0 0 686 0 0 0 0 0 1.13e-01
TCGA-EY-A1GD-01A-11D-A13L-09 57 0 173 0 0 0 581 771 0 0 0 0 0 0 0 0 274 0 0 0 0 38 7.91e-02
TCGA-EY-A1GI-01A-11D-A13L-09 101 0 421 0 0 0 4053 2481 0 0 0 0 0 0 0 0 818 0 0 0 0 0 2.07e-01
TCGA-QF-A5YS-01A-11D-A31U-09 115 0 110 0 0 0 1152 1141 0 0 0 0 0 0 0 0 220 0 0 0 0 50 2.38e-02
TCGA-QS-A5YQ-01A-11D-A31U-09 58 0 128 0 0 0 857 485 0 0 0 0 0 0 0 0 142 0 0 0 0 0 7.23e-01