library(panstripe)
library(tidyverse)
library(data.table)
library(ape)
library(ggthemes)
library(ggtree)
Load phylogenies and the gene presence/absence matrix from Gladstone et al., 2020.
pa <- read_rtab("./data/ecoli/gene_presence_absence.Rtab")
rownames(pa) <- gsub("\\.contigs.*", "", gsub("#", "_", rownames(pa)))
tree_files <- Sys.glob("./data/ecoli/clade*.nwk")
.x <- tree_files[[1]]
clades <- map(tree_files, ~{
tr <- read.tree(.x)
tr$tip.label <- gsub("_[0-9]+$", "", tr$tip.label)
stopifnot(all(tr$tip.label %in% rownames(pa)))
tpa <- pa[rownames(pa) %in% tr$tip.label, ]
tpa <- tpa[, colSums(tpa) > 0]
return(list(clade = gsub("_.*", "", gsub(".*ecoli/", "", .x)), tree = tr, pa = tpa))
})
names(clades) <- map_chr(clades, ~.x$clade)
fits <- imap(clades, ~{
panstripe(.x$pa, .x$tree)
})
First we plot the input phylogenies for each clade.
patchwork::wrap_plots(imap(clades, ~ggtree(.x$tree) + theme_tree2() + ggtitle(.y)))
We first plot a standard accumulation curve. This does not account for differences in error rates and the underlying population structure. Interestingly without controlling for population structure it would be easy to assume clade B was the most diverse. However, we show below that this is not that case which fits much better with previous phylogenetic analysis of the NORM data set.
plot_acc(map(clades, ~.x$pa)) + theme_clean(base_size = 20) + theme(plot.background = element_blank(),
legend.background = element_blank(), axis.title.x = element_blank())
ggsave("./figures/ecoli_accumulation_curves.png", width = 12, height = 7)
ggsave("./figures/ecoli_accumulation_curves.pdf", width = 12, height = 7)
If we instead plot the panstripe curves we can see that clade B appears to be quite different to the other clades and actually has lower rates of gene gain and loss. This makes sense given it is older and fits with the gain and loss of resistance described in Gladstone et al., 2020.
plot_pangenome_fits(fits, include_data = TRUE, trim = FALSE, facet = TRUE)
We can confirm this difference by performing pairwise comparisons between the slopes of fitted models. This can be done both for the association with the core branch length and with the tips of the phylogeny. Significant associations with the tips are likely to be driven by both highly mobile elements that do not persist long enough to be observed more than once as well as the effects of annotation errors.
comb <- combinat::combn2(1:length(clades))
comparisons <- map2_dfr(comb[, 1], comb[, 2], ~{
fA <- fits[[.x]]
fB <- fits[[.y]]
return(compare_pangenomes(fA, fB)$summary %>% add_column(clade1 = names(clades)[[.x]]) %>%
add_column(clade2 = names(clades)[[.y]]))
})
Look at comparisons of the rate of gene/gain loss at the core genome level
pdf <- comparisons %>% filter(term == "core") %>% arrange(p.value)
pdf$p.adj <- p.adjust(pdf$p.value, method = "BH")
knitr::kable(pdf[, c("clade1", "clade2", "statistic", "p.value", "p.adj")])
| clade1 | clade2 | statistic | p.value | p.adj |
|---|---|---|---|---|
| cladeB | cladeC1 | 5.6183114 | 0.0000000 | 0.0000003 |
| cladeB | cladeC2 | 4.6302382 | 0.0000060 | 0.0000181 |
| cladeA | cladeB | -2.7651666 | 0.0060607 | 0.0121213 |
| cladeC1 | cladeC2 | 2.1426420 | 0.0331967 | 0.0497951 |
| cladeA | cladeC2 | 1.6419633 | 0.1019127 | 0.1222953 |
| cladeA | cladeC1 | 0.1756059 | 0.8607318 | 0.8607318 |
Look at comparisons of the rate of gene/gain loss at the tips of the phylogenies
pdf <- comparisons %>% filter(term == "istip") %>% arrange(p.value)
pdf$p.adj <- p.adjust(pdf$p.value, method = "BH")
knitr::kable(pdf[, c("clade1", "clade2", "statistic", "p.value", "p.adj")])
| clade1 | clade2 | statistic | p.value | p.adj |
|---|---|---|---|---|
| cladeB | cladeC1 | 1.1645386 | 0.2452150 | 0.8487711 |
| cladeA | cladeC1 | 0.7973069 | 0.4259557 | 0.8487711 |
| cladeA | cladeB | -0.6526868 | 0.5144843 | 0.8487711 |
| cladeC1 | cladeC2 | -0.3930507 | 0.6946477 | 0.8487711 |
| cladeB | cladeC2 | 0.3759245 | 0.7073093 | 0.8487711 |
| cladeA | cladeC2 | 0.1163089 | 0.9075054 | 0.9075054 |
Look at comparisons of the rate of gene/gain loss at the tips of the phylogenies
pdf <- comparisons %>% filter(term == "istip") %>% arrange(p.value)
pdf$p.adj <- p.adjust(pdf$p.value, method = "BH")
knitr::kable(pdf[, c("clade1", "clade2", "statistic", "p.value", "p.adj")])
| clade1 | clade2 | statistic | p.value | p.adj |
|---|---|---|---|---|
| cladeB | cladeC1 | 1.1645386 | 0.2452150 | 0.8487711 |
| cladeA | cladeC1 | 0.7973069 | 0.4259557 | 0.8487711 |
| cladeA | cladeB | -0.6526868 | 0.5144843 | 0.8487711 |
| cladeC1 | cladeC2 | -0.3930507 | 0.6946477 | 0.8487711 |
| cladeB | cladeC2 | 0.3759245 | 0.7073093 | 0.8487711 |
| cladeA | cladeC2 | 0.1163089 | 0.9075054 | 0.9075054 |
We can also investigate whether there are differences in the rates of recombination events versus the size of these events between the clades.
pdf <- comparisons %>% filter(term == "dispersion model") %>% arrange(p.value)
pdf$p.adj <- p.adjust(pdf$p.value, method = "BH")
knitr::kable(pdf[, c("clade1", "clade2", "statistic", "p.value", "p.adj")])
| clade1 | clade2 | statistic | p.value | p.adj |
|---|---|---|---|---|
| cladeB | cladeC1 | 10.4105935 | 0.0012529 | 0.0075177 |
| cladeB | cladeC2 | 6.5235600 | 0.0106455 | 0.0319364 |
| cladeA | cladeC1 | 3.6585925 | 0.0557807 | 0.1115615 |
| cladeA | cladeC2 | 2.2526461 | 0.1333861 | 0.2000792 |
| cladeA | cladeB | 1.5078423 | 0.2194686 | 0.2633624 |
| cladeC1 | cladeC2 | 0.0281808 | 0.8666844 | 0.8666844 |
# load efaelcis fits
efits <- readRDS("./data/efaecalis_fits.RDS")
combined_fits <- c(fits, efits)
pdf <- plot_pangenome_fits(combined_fits, include_data = TRUE, trim = TRUE, facet = TRUE,
plot = FALSE)
pdf$point_data$species <- ifelse(grepl("pp.*", pdf$point_data$pangenome), "E. faecalis",
"E. coli")
pdf$fit_data$species <- ifelse(grepl("pp.*", pdf$fit_data$pangenome), "E. faecalis",
"E. coli")
ggplot2::ggplot(pdf$fit_data, ggplot2::aes(x = .data$core, y = .data$acc, colour = .data$pangenome)) +
ggplot2::geom_point(data = pdf$point_data, ggplot2::aes(y = .data$acc, colour = .data$pangenome),
alpha = 1) + ggplot2::geom_line() + ggplot2::geom_ribbon(ggplot2::aes(ymin = .data$lower,
ymax = .data$upper, fill = .data$pangenome), alpha = 0.3) + facet_grid(pangenome ~
species) + ggplot2::scale_colour_brewer(type = "qual", palette = 3) + ggplot2::scale_fill_brewer(type = "qual",
palette = 3) + theme_clean(base_size = 20) + theme(plot.background = element_blank(),
legend.background = element_blank(), axis.title.x = element_blank()) + ggplot2::xlab("core phylogentic branch distance")
ggsave("./figures/ecoli_efaecalis_pangenome_fits.png", width = 9, height = 15)
ggsave("./figures/ecoli_efaecalis_pangenome_fits.pdf", width = 9, height = 15)
comb <- combinat::combn2(1:length(combined_fits))
comparisons <- map2_dfr(comb[, 1], comb[, 2], ~{
fA <- combined_fits[[.x]]
fB <- combined_fits[[.y]]
return(compare_pangenomes(fA, fB, nboot = 0)$summary %>% add_column(clade1 = names(combined_fits)[[.x]]) %>%
add_column(clade2 = names(combined_fits)[[.y]]))
})
pdf <- comparisons %>% filter(term == "core") %>% arrange(p.value)
pdf$p.adj <- p.adjust(pdf$p.value, method = "BH")
knitr::kable(pdf[, c("clade1", "clade2", "estimate", "p.value", "p.adj")])
| clade1 | clade2 | estimate | p.value | p.adj |
|---|---|---|---|---|
| cladeC1 | pp2 | -0.1179210 | 0.0000000 | 0.0000000 |
| cladeC2 | pp2 | -0.1535755 | 0.0000000 | 0.0000000 |
| cladeC1 | pp18 | -0.1169506 | 0.0000000 | 0.0000000 |
| cladeB | cladeC1 | 0.0957240 | 0.0000000 | 0.0000002 |
| cladeC2 | pp18 | -0.1501964 | 0.0000001 | 0.0000005 |
| cladeA | pp2 | -0.1315701 | 0.0000002 | 0.0000006 |
| cladeC2 | pp6 | -0.1105395 | 0.0000004 | 0.0000013 |
| cladeC1 | pp6 | -0.0746640 | 0.0000025 | 0.0000065 |
| cladeB | cladeC2 | 0.1288566 | 0.0000060 | 0.0000141 |
| cladeA | pp18 | -0.1217259 | 0.0003186 | 0.0006690 |
| pp2 | pp6 | 0.0350401 | 0.0011132 | 0.0021252 |
| cladeA | cladeB | -0.0921177 | 0.0060607 | 0.0106061 |
| cladeA | pp6 | -0.0725774 | 0.0071059 | 0.0114788 |
| pp18 | pp6 | 0.0280971 | 0.0110173 | 0.0165260 |
| cladeC1 | cladeC2 | 0.0511969 | 0.0331967 | 0.0464754 |
| cladeB | pp18 | -0.0193347 | 0.0801598 | 0.1052097 |
| cladeB | pp2 | -0.0095319 | 0.0982654 | 0.1188982 |
| cladeA | cladeC2 | 0.0589160 | 0.1019127 | 0.1188982 |
| cladeB | pp6 | 0.0136439 | 0.2931642 | 0.3240236 |
| cladeA | cladeC1 | 0.0052923 | 0.8607318 | 0.9037684 |
| pp18 | pp2 | 0.0004274 | 0.9575907 | 0.9575907 |
pdf <- comparisons %>% filter(term == "istip") %>% arrange(p.value)
pdf$p.adj <- p.adjust(pdf$p.value, method = "BH")
knitr::kable(pdf[, c("clade1", "clade2", "estimate", "p.value", "p.adj")])
| clade1 | clade2 | estimate | p.value | p.adj |
|---|---|---|---|---|
| cladeC1 | pp2 | -1.2123586 | 0.0000456 | 0.0009575 |
| cladeC2 | pp2 | -0.8540567 | 0.0033857 | 0.0355498 |
| cladeA | pp2 | -0.8634859 | 0.0066134 | 0.0462938 |
| cladeC1 | pp6 | -0.8603867 | 0.0412070 | 0.2163369 |
| cladeB | pp2 | -0.5485022 | 0.0588120 | 0.2470103 |
| cladeC1 | pp18 | -1.0550458 | 0.0712276 | 0.2492967 |
| cladeA | pp6 | -0.6191140 | 0.1748901 | 0.4608102 |
| cladeC2 | pp6 | -0.5546974 | 0.1755467 | 0.4608102 |
| cladeC2 | pp18 | -0.7091404 | 0.2143857 | 0.4626142 |
| cladeA | pp18 | -0.7572843 | 0.2202925 | 0.4626142 |
| cladeB | cladeC1 | 0.5598526 | 0.2452150 | 0.4681378 |
| cladeB | pp6 | -0.4660805 | 0.2723241 | 0.4765672 |
| cladeB | pp18 | -0.5765139 | 0.3175483 | 0.5129626 |
| pp2 | pp6 | 0.2316669 | 0.4086394 | 0.5963380 |
| cladeA | cladeC1 | 0.3998273 | 0.4259557 | 0.5963380 |
| cladeA | cladeB | -0.3210117 | 0.5144843 | 0.6752607 |
| pp18 | pp6 | 0.1678748 | 0.6411253 | 0.7817629 |
| cladeC1 | cladeC2 | -0.1791053 | 0.6946477 | 0.7817629 |
| cladeB | cladeC2 | 0.1844445 | 0.7073093 | 0.7817629 |
| pp18 | pp2 | -0.0987072 | 0.7574540 | 0.7953267 |
| cladeA | cladeC2 | 0.0597987 | 0.9075054 | 0.9075054 |
pdf <- comparisons %>% filter(term == "dispersion model") %>% arrange(p.value)
pdf$p.adj <- p.adjust(pdf$p.value, method = "BH")
knitr::kable(pdf[, c("clade1", "clade2", "estimate", "p.value", "p.adj")])
| clade1 | clade2 | estimate | p.value | p.adj |
|---|---|---|---|---|
| cladeC1 | pp2 | NA | 0.0000000 | 0.0000000 |
| cladeC1 | pp6 | NA | 0.0000000 | 0.0000000 |
| cladeC2 | pp2 | NA | 0.0000000 | 0.0000001 |
| cladeC2 | pp6 | NA | 0.0000002 | 0.0000012 |
| cladeA | pp2 | NA | 0.0000005 | 0.0000020 |
| cladeA | pp6 | NA | 0.0000086 | 0.0000266 |
| cladeC1 | pp18 | NA | 0.0000089 | 0.0000266 |
| cladeB | pp2 | NA | 0.0002078 | 0.0005455 |
| cladeC2 | pp18 | NA | 0.0004596 | 0.0010724 |
| cladeB | cladeC1 | NA | 0.0012529 | 0.0025854 |
| cladeB | pp6 | NA | 0.0013543 | 0.0025854 |
| cladeB | cladeC2 | NA | 0.0106455 | 0.0186296 |
| cladeA | pp18 | NA | 0.0115933 | 0.0187276 |
| cladeA | cladeC1 | NA | 0.0557807 | 0.0836711 |
| cladeA | cladeC2 | NA | 0.1333861 | 0.1867406 |
| cladeB | pp18 | NA | 0.1612102 | 0.2115884 |
| cladeA | cladeB | NA | 0.2194686 | 0.2711083 |
| pp18 | pp6 | NA | 0.2501810 | 0.2918779 |
| pp18 | pp2 | NA | 0.3501297 | 0.3869855 |
| pp2 | pp6 | NA | 0.7860377 | 0.8253396 |
| cladeC1 | cladeC2 | NA | 0.8666844 | 0.8666844 |
plot_acc(map(combined_fits, ~.x$pa), color_pallete = 3) + theme_clean(base_size = 20) +
theme(plot.background = element_blank(), legend.background = element_blank(),
axis.title.x = element_blank())
ggsave("./figures/ecoli_efaecalis_accumulation_curves.png", width = 12, height = 7)
ggsave("./figures/ecoli_efaecalis_accumulation_curves.pdf", width = 12, height = 7)