These are the scripts used to search for the presence of the novel mutational signature (SBS_AnT) in the data set of 19,184 whole-exomes and 4,645 whole-genomes that was compiled for the PCAWG on Mutational Signatures (Alexandrov et al., 2018).
SBS_AnT is characterized by extremely strong enrichment for adenines 4bp and 3bp 5’ of mutated thymines.
We will use this characteristic to search for this mutational signature in the whole-exome and whole-genome data.
To test for enrichment we perform binomial tests, comparing: a) the proportion of T>N mutations with an adenine 3 or 4bp 5’ against b) the proportion thymines in the human genome or exome that has an adenine 3 or 4bp 5’
The null hypothesis is that the proportion of T>N mutations with A at -3 (or -4) is equal to the proportion of thymines in the human genome/exome with A at -3 (or -4)
We retrieved the variant calls for 19,184 whole-exomes and 4,645 whole-genomes, and counted:
We load these data into data frame df, which is keyed by the name of the sample and also contains the field dataType indicating whether the data are whole-genome or whole-exome.
## Read the tab-separated per-sample mutation counts; as.is = TRUE keeps
## character columns as strings instead of converting them to factors.
df <- read.csv("input/mutCounts_allExomes+allGenomes.txt", sep = "\t", as.is = TRUE)
## Keep only samples with at least 500 (genome) or at least 50 (exome) thymine
## mutations. Row indices are concatenated, so the whole-genome rows come
## first and the whole-exome rows follow.
wgsRows <- which(grepl("WGS", df$dataType) & df$T2Ncount > 499)
wesRows <- which(grepl("WES", df$dataType) & df$T2Ncount > 49)
df <- df[c(wgsRows, wesRows), ]
## Load the tri-, tetra- and pentanucleotide abundances. After loading, the
## variable *opp* is available; it contains the abundances of tri-, tetra- and
## pentanucleotides for the human genome and exome.
load(file = "input/trinucContent_hs37d5.RData")
## Source the mSigTools helper script (presumably the origin of revc(), which
## rev.comp() calls -- confirm against that script).
source(file = "src/mSigTools.v0.13.R")
## Loading required package: SnowballC
## Upper-case `seq` and hand it to revc(). revc() is not defined in this file;
## presumably it is provided by the sourced mSigTools script and returns the
## reverse complement -- confirm there.
rev.comp <- function(seq) {
  upperSeq <- toupper(seq)
  revc(upperSeq)
}
The following function tests for statistically significant enrichment for the adenines at the -3 and -4 position relative to mutations from thymines.
## Test every sample for enrichment of adenines 3bp and 4bp 5' of mutated
## thymines.
##
## For each row of `df` two one-sided binomial tests are performed:
##
## - test 1: H0: the proportion of T>N mutations with A at -3 equals the
##           proportion of all T's in the reference (genome or exome) with A
##           at -3; HA: the proportion among mutations is greater.
## - test 2: the same for A at -4.
##
## The function relies on two objects from the calling environment:
##   * `opp`      - list whose elements `4bp` and `5bp` are tables with the
##                  k-mer sequence in column `NA.` and abundance columns
##                  `genome` and `exome..agilent.V6.`
##   * `rev.comp` - function applied to one k-mer at a time
##
## Arguments:
##   df - data frame with columns `dataType` (values containing "WGS" or
##        "WES"), `T2Ncount` (number of T>N mutations), `ANNTcount` (T>N
##        mutations with an A 3bp 5') and `ANNNTcount` (T>N mutations with an
##        A 4bp 5').
##
## Value:
##   `df` with the numeric columns `pval_ANNT2ANNN` (test 1) and
##   `pval_ANNNT2ANNNN` (test 2) added or overwritten. A data frame with zero
##   rows is returned with both (empty) columns added.
enrichmentTest <- function(df) {
  nSamples <- nrow(df)

  ## Size the result columns by nrow(df) so an empty data frame is accepted.
  df$pval_ANNT2ANNN <- rep(NA_real_, nSamples)
  df$pval_ANNNT2ANNNN <- rep(NA_real_, nSamples)
  if (nSamples == 0L) {
    return(df)
  }

  ## Row masks over a k-mer table: `all` selects the k-mers counted as
  ## thymine contexts, `target` those that also carry an A at the first
  ## position (i.e. k - 1 bases 5' of the T at position k).
  ##
  ## NOTE(review): if rev.comp() is a true reverse complement, then
  ## "first base of revComp is A" is the same as "last base of the k-mer is T"
  ## (and "last base of revComp is T" the same as "first base is A"), so the
  ## revComp clauses select exactly the rows the forward clauses already
  ## select. The original conditions are kept unchanged -- confirm whether
  ## counting the opposite strand was intended.
  contextMasks <- function(kmers, k) {
    forward <- as.character(kmers$NA.)
    ## rev.comp() is applied element by element, as it may not be vectorised
    revComp <- vapply(forward, rev.comp, character(1), USE.NAMES = FALSE)
    lastIsT <- substr(forward, k, k) == "T"
    firstIsA <- substr(forward, 1, 1) == "A"
    rcFirstIsA <- substr(revComp, 1, 1) == "A"
    rcLastIsT <- substr(revComp, k, k) == "T"
    list(all = lastIsT | rcFirstIsA,
         target = (lastIsT & firstIsA) | (rcFirstIsA & rcLastIsT))
  }

  ## Expected proportion under H0 for one abundance column of a k-mer table
  expectedFraction <- function(kmers, masks, column) {
    sum(kmers[masks$target, column]) / sum(kmers[masks$all, column])
  }

  ## Map each sample's dataType onto the abundance column to compare against
  refColumn <- as.character(df$dataType)
  refColumn[grepl("WGS", refColumn)] <- "genome"
  refColumn[grepl("WES", refColumn)] <- "exome..agilent.V6."

  ## The expected proportions depend only on the reference column, so compute
  ## them once per distinct column instead of once per sample.
  refLevels <- unique(refColumn)
  masks4 <- contextMasks(opp$`4bp`, 4)
  masks5 <- contextMasks(opp$`5bp`, 5)
  expected3ByRef <- vapply(
    refLevels,
    function(column) expectedFraction(opp$`4bp`, masks4, column),
    numeric(1)
  )
  expected4ByRef <- vapply(
    refLevels,
    function(column) expectedFraction(opp$`5bp`, masks5, column),
    numeric(1)
  )
  refIndex <- match(refColumn, refLevels)
  expectedMinus3 <- expected3ByRef[refIndex]
  expectedMinus4 <- expected4ByRef[refIndex]

  stopifnot(mode(df$T2Ncount) == "numeric")
  stopifnot(mode(df$ANNTcount) == "numeric")
  stopifnot(mode(df$ANNNTcount) == "numeric")

  ## Collect p-values in plain vectors; the data frame is modified only once.
  pvalMinus3 <- numeric(nSamples)
  pvalMinus4 <- numeric(nSamples)
  for (i in seq_len(nSamples)) {
    obs.mut <- df$T2Ncount[i] ## total number of T>N mutations

    ## test1: T>N mutations with an A 3bp 5'
    pvalMinus3[i] <- binom.test(df$ANNTcount[i],
                                obs.mut,
                                expectedMinus3[i],
                                alternative = "greater")$p.value

    ## test2: T>N mutations with an A 4bp 5'
    pvalMinus4[i] <- binom.test(df$ANNNTcount[i],
                                obs.mut,
                                expectedMinus4[i],
                                alternative = "greater")$p.value
  }

  df$pval_ANNT2ANNN <- pvalMinus3
  df$pval_ANNNT2ANNNN <- pvalMinus4
  df
}
Find samples with mutations from thymine that have statistically significant enrichment for adenines at the -3 and -4 positions and with proportions of adenines above determined cutoffs.
## Append the counts for sample 62074759 as an additional row named 62074759T
df <- rbind(df, '62074759T' = list("WGS", 34905, 27375, 22695, 18549))
## Add the two enrichment p-value columns
df <- enrichmentTest(df)
## Benjamini-Hochberg adjust each set of p-values over all samples and keep
## only the tumors with adjusted p < 0.05 in both tests
adjMinus3 <- p.adjust(df$pval_ANNT2ANNN, method = "BH")
adjMinus4 <- p.adjust(df$pval_ANNNT2ANNNN, method = "BH")
result <- df[adjMinus3 < 0.05 & adjMinus4 < 0.05, ]
## Scatter plot of the enrichment for adenines 3bp (x) and 4bp (y) 5' of
## mutated thymines: every sample in black, overplotted in magenta for the
## samples kept in `result`.
plot(df$ANNTcount / df$T2Ncount, df$ANNNTcount / df$T2Ncount,
     pch = 16, cex = 0.5,
     main = "Identification of samples with enrichment of adenines\n(3 and 4bp 5' of mutated thymines)",
     xlab = "Proportion thymines with adenine 3bp 5'",
     ylab = "Proportion thymines with adenine 4bp 5'")
points(result$ANNTcount / result$T2Ncount, result$ANNNTcount / result$T2Ncount,
       pch = 16, cex = 0.5, col = "magenta")
## Highlight sample 62074759. The row is selected by exact row name rather
## than by its position in `result`, so a different sample can never be
## starred by accident; nothing is drawn if the sample is not in `result`.
highlighted <- result[rownames(result) == "62074759T", , drop = FALSE]
points(highlighted$ANNTcount / highlighted$T2Ncount,
       highlighted$ANNNTcount / highlighted$T2Ncount,
       pch = 8, cex = 0.8, col = "red", lwd = 1.5)
## Expected ratio of A's 5' of thymines (abundance in the reference). The
## first point is drawn with pch 3 and the second with pch 4, matching the
## legend entries "expected_genomes" (pch 3) and "expected_exomes" (pch 4).
points(c(325868953, 3890707) / c(1074123658, 14758195),
       c(318941964, 3724456) / c(1074123658, 14758195),
       pch = c(3, 4), cex = 0.8, col = "lightgreen", lwd = 3)
legend("topleft",
       legend = c("not significant", "significant", "62074759",
                  "expected_exomes", "expected_genomes"),
       col = c("black", "magenta", "red", "lightgreen", "lightgreen"),
       pch = c(16, 16, 8, 4, 3),
       pt.lwd = c(1, 1, 1.5, 2, 2),
       pt.cex = 1.1, cex = 0.7, bty = "n", ncol = 1)
## Candidate samples: among the significant samples, keep those in which more
## than 40% of the T>N mutations have an adenine at -3 or at -4
fracMinus3 <- result$ANNTcount / result$T2Ncount
fracMinus4 <- result$ANNNTcount / result$T2Ncount
selected <- result[fracMinus3 > 0.4 | fracMinus4 > 0.4, ]
## Format the p-values (3 significant digits) for the html output table
for (pvalColumn in c("pval_ANNT2ANNN", "pval_ANNNT2ANNNN")) {
  selected[[pvalColumn]] <- format(selected[[pvalColumn]], digits = 3)
}
library(knitr)
tableCaption <- paste("Whole-exome and whole-genome sequenced samples",
                      "with likely AnT exposure based on enrichment of",
                      "adenines 5' of mutated thymines")
knitr::kable(selected, row.names = TRUE, align = "c", caption = tableCaption)
| dataType | totalMuts | T2Ncount | ANNTcount | ANNNTcount | pval_ANNT2ANNN | pval_ANNNT2ANNNN | |
|---|---|---|---|---|---|---|---|
| Eso-AdenoCa::LP6005935-DNA_B03___ICGC:ESAD-UK | WGS_Other | 45144 | 33324 | 13342 | 12543 | 6.67e-309 | 7.14e-228 |
| Panc-AdenoCa::8069334___ICGC:PACA-AU | WGS_Other | 7164 | 3682 | 1483 | 1331 | 1.19e-37 | 7.78e-19 |
| Prost-AdenoCa::0047_CRUK_PC_0047_T1_DNA___ICGC:PRAD-UK | WGS_Other | 3369 | 1514 | 863 | 730 | 3.36e-102 | 2.09e-53 |
| COAD-US::SP22031___PCAWG | WGS_ICGC | 234336 | 71001 | 31769 | 33139 | 0.00e+00 | 0.00e+00 |
| COAD-US::SP16886___PCAWG | WGS_ICGC | 850298 | 337390 | 149095 | 152004 | 0.00e+00 | 0.00e+00 |
| COAD-US::SP19295___PCAWG | WGS_ICGC | 260008 | 72187 | 31520 | 32393 | 0.00e+00 | 0.00e+00 |
| COAD-US::SP17905___PCAWG | WGS_ICGC | 2439746 | 1129261 | 475376 | 468709 | 0.00e+00 | 0.00e+00 |
| COAD-US::SP21400___PCAWG | WGS_ICGC | 794330 | 308195 | 127033 | 121523 | 0.00e+00 | 0.00e+00 |
| COAD-US::SP18946___PCAWG | WGS_ICGC | 962132 | 330311 | 143781 | 139479 | 0.00e+00 | 0.00e+00 |
| ESAD-UK::SP111026___PCAWG | WGS_ICGC | 47007 | 38593 | 15452 | 14388 | 0.00e+00 | 1.02e-241 |
| ESAD-UK::SP111101___PCAWG | WGS_ICGC | 17017 | 7777 | 3192 | 2964 | 4.64e-89 | 6.24e-61 |
| READ-US::SP80615___PCAWG | WGS_ICGC | 2433765 | 1066128 | 458226 | 453231 | 0.00e+00 | 0.00e+00 |
| READ-US::SP81494___PCAWG | WGS_ICGC | 26908 | 8134 | 3434 | 3092 | 1.54e-113 | 2.87e-62 |
| READ-US::SP81711___PCAWG | WGS_ICGC | 20415 | 6971 | 3155 | 2827 | 5.01e-151 | 1.36e-87 |
| READ-US::SP80754___PCAWG | WGS_ICGC | 11274 | 4522 | 2738 | 2290 | 0.00e+00 | 4.21e-196 |
| UCEC-US::SP92659___PCAWG | WGS_ICGC | 282392 | 77606 | 36861 | 38199 | 0.00e+00 | 0.00e+00 |
| Biliary-AdenoCa::BD121T___ICGC:BTCA-JP | WES_Other | 2284 | 447 | 180 | 171 | 2.04e-10 | 9.16e-10 |
| Biliary-AdenoCa::BD173T___ICGC:BTCA-JP | WES_Other | 1231 | 749 | 678 | 513 | 3.89e-300 | 4.91e-136 |
| Biliary-AdenoCa::BD182T___ICGC:BTCA-JP | WES_Other | 2117 | 1814 | 974 | 771 | 5.30e-132 | 8.77e-58 |
| Biliary-AdenoCa::BD223T___ICGC:BTCA-JP | WES_Other | 425 | 218 | 148 | 113 | 4.29e-37 | 4.17e-17 |
| ColoRect-AdenoCa::sysucc-311T___ICGC:COCA-CN | WES_Other | 14191 | 5763 | 2307 | 2190 | 1.41e-109 | 3.78e-101 |
| Liver-HCC::HCC34T___ICGC:LINC-JP | WES_Other | 249 | 96 | 40 | 38 | 9.32e-04 | 1.40e-03 |
| Panc-AdenoCa::PCSI_0060_Pa_X___ICGC:PACA-CA | WES_Other | 113 | 55 | 43 | 33 | 2.03e-15 | 5.09e-08 |
| Skin-Melanoma::SKCM-JWCI-WGS-8-Tumor___doi:10.1016/j.cell.2012.06.024 | WES_Other | 705 | 57 | 24 | 24 | 7.80e-03 | 3.96e-03 |
| Eso-AdenoCa::ESO-173___PMID:23525077 | WES_Other | 202 | 77 | 35 | 30 | 2.75e-04 | 5.52e-03 |
| Uterus-AdenoCa::T155___PMID:23104009 | WES_Other | 983 | 186 | 83 | 73 | 8.50e-08 | 1.84e-05 |
| ACUTE MYELOID LEUKEMIA::TCGA-AB-2824-03B-01W-0728-08___TCGA-LAML | WES_TCGA | 72 | 71 | 46 | 35 | 1.74e-11 | 1.10e-05 |
| ACUTE MYELOID LEUKEMIA::TCGA-AB-2851-03B-01W-0728-08___TCGA-LAML | WES_TCGA | 60 | 60 | 45 | 24 | 7.02e-15 | 8.47e-03 |
| ACUTE MYELOID LEUKEMIA::TCGA-AB-2867-03B-01W-0728-08___TCGA-LAML | WES_TCGA | 115 | 104 | 85 | 40 | 1.00e-31 | 2.01e-03 |
| ACUTE MYELOID LEUKEMIA::TCGA-AB-2868-03B-01W-0728-08___TCGA-LAML | WES_TCGA | 212 | 197 | 159 | 82 | 1.46e-56 | 3.68e-07 |
| BLADDER UROTHELIAL CARCINOMA::TCGA-G2-AA3B-01A-11D-A391-08___TCGA-BLCA | WES_TCGA | 946 | 66 | 38 | 26 | 1.04e-07 | 7.94e-03 |
| BLADDER UROTHELIAL CARCINOMA::TCGA-GC-A6I3-01A-11D-A31L-08___TCGA-BLCA | WES_TCGA | 346 | 64 | 38 | 27 | 3.33e-08 | 2.25e-03 |
| CERVICAL SQUAMOUS CELL CARCINOMA AND ENDOCERVICAL ADENOCARCINOMA::TCGA-FU-A3HZ-01A-11D-A20U-09___TCGA-CESC | WES_TCGA | 3256 | 1279 | 493 | 523 | 5.53e-21 | 1.46e-34 |
| COLON ADENOCARCINOMA::TCGA-AY-4071-01A-01W-1073-09___TCGA-COAD | WES_TCGA | 165 | 53 | 31 | 24 | 9.56e-07 | 1.21e-03 |
| ESOPHAGEAL CARCINOMA::TCGA-2H-A9GM-01A-11D-A37C-09___TCGA-ESCA | WES_TCGA | 173 | 61 | 31 | 24 | 4.60e-05 | 1.07e-02 |
| ESOPHAGEAL CARCINOMA::TCGA-IG-A4QS-01A-11D-A27G-09___TCGA-ESCA | WES_TCGA | 321 | 154 | 57 | 66 | 2.81e-03 | 1.42e-06 |
| ESOPHAGEAL CARCINOMA::TCGA-R6-A6L4-01A-11D-A31U-09___TCGA-ESCA | WES_TCGA | 163 | 81 | 34 | 37 | 1.86e-03 | 5.31e-05 |
| HEAD AND NECK SQUAMOUS CELL CARCINOMA::TCGA-BA-A4IG-01A-11D-A25Y-08___TCGA-HNSC | WES_TCGA | 170 | 52 | 35 | 25 | 9.57e-10 | 3.15e-04 |
| RECTUM ADENOCARCINOMA::TCGA-AG-3892-01A-01W-1073-09___TCGA-READ | WES_TCGA | 3593 | 976 | 406 | 397 | 1.85e-24 | 3.72e-26 |
| RECTUM ADENOCARCINOMA::TCGA-AG-3902-01A-01W-1073-09___TCGA-READ | WES_TCGA | 178 | 66 | 43 | 38 | 6.01e-11 | 2.46e-08 |
| STOMACH ADENOCARCINOMA::TCGA-BR-6453-01A-11D-1800-08___TCGA-STAD | WES_TCGA | 228 | 125 | 53 | 44 | 9.01e-05 | 8.40e-03 |
| STOMACH ADENOCARCINOMA::TCGA-D7-A4Z0-01A-22D-A25D-08___TCGA-STAD | WES_TCGA | 369 | 279 | 116 | 99 | 3.82e-08 | 9.16e-05 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-A5-A0GP-01A-11W-A062-09___TCGA-UCEC | WES_TCGA | 2229 | 589 | 241 | 214 | 2.81e-14 | 1.62e-09 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-AJ-A5DW-01A-11D-A27P-09___TCGA-UCEC | WES_TCGA | 5623 | 1141 | 454 | 460 | 1.67e-22 | 5.21e-29 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-AP-A1E0-01A-11D-A135-09___TCGA-UCEC | WES_TCGA | 8411 | 1853 | 726 | 749 | 1.75e-32 | 1.57e-46 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-AX-A1CE-01A-11D-A135-09___TCGA-UCEC | WES_TCGA | 19621 | 3038 | 1379 | 1097 | 4.74e-110 | 2.32e-40 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-BK-A6W3-01A-12D-A34Q-09___TCGA-UCEC | WES_TCGA | 7184 | 1672 | 672 | 649 | 6.42e-34 | 2.15e-34 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-DF-A2KV-01A-11D-A17W-09___TCGA-UCEC | WES_TCGA | 2391 | 654 | 254 | 273 | 4.93e-12 | 2.43e-20 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-E6-A1M0-01A-11D-A142-09___TCGA-UCEC | WES_TCGA | 4792 | 1011 | 418 | 401 | 1.61e-24 | 5.95e-24 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-EO-A3AV-01A-12D-A19Y-09___TCGA-UCEC | WES_TCGA | 7328 | 2014 | 821 | 811 | 1.19e-43 | 1.79e-49 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-EO-A3AY-01A-12D-A19Y-09___TCGA-UCEC | WES_TCGA | 6317 | 1731 | 728 | 663 | 2.49e-44 | 3.93e-33 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-EY-A1GD-01A-11D-A13L-09___TCGA-UCEC | WES_TCGA | 1895 | 701 | 282 | 256 | 2.54e-15 | 2.52e-11 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-EY-A1GI-01A-11D-A13L-09___TCGA-UCEC | WES_TCGA | 7878 | 1787 | 722 | 708 | 3.74e-37 | 1.09e-40 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-QF-A5YS-01A-11D-A31U-09___TCGA-UCEC | WES_TCGA | 2789 | 534 | 228 | 231 | 5.70e-16 | 1.06e-19 |
| UTERINE CORPUS ENDOMETRIAL CARCINOMA::TCGA-QS-A5YQ-01A-11D-A31U-09___TCGA-UCEC | WES_TCGA | 1670 | 323 | 128 | 139 | 2.11e-07 | 2.57e-12 |
| 62074759T | WGS | 34905 | 27375 | 22695 | 18549 | 0.00e+00 | 0.00e+00 |
## Save the candidate table: tab-separated, unquoted, row names included
write.table(selected,
            file = "candidatesW_AnT.txt",
            sep = "\t",
            quote = FALSE,
            row.names = TRUE)