library(panstripe)
library(tidyverse)
library(data.table)
library(ape)
library(ggthemes)
library(ggtree)
library(ggtreeExtra)
pa <- read_rtab("./data/efaecalis/Efcs_gene_presence_absence.Rtab")
rownames(pa) <- gsub("\\.velvet.*", "", rownames(pa))
tree_files <- Sys.glob("./data/efaecalis/*.tre")
clades <- map(tree_files, ~{
tree <- read.nexus(.x)
tree$tip.label <- gsub(".*/", "", gsub("_[0-9]_[0-9]+$", "", tree$tip.label))
name <- gsub(".*/", "", gsub("_.*", "", .x))
tpa <- pa[rownames(pa) %in% tree$tip.label, ]
return(list(clade = name, pa = tpa, tree = tree))
})
names(clades) <- map_chr(clades, ~.x$clade)
patchwork::wrap_plots(map(clades, ~ggtree(.x$tree) + ggtree::theme_tree2() + ggtitle(.x$clade)))
fits <- map(clades, ~panstripe(.x$pa, .x$tree, quiet = FALSE))
fits$pp18$summary
#> # A tibble: 7 × 7
#> term estimate std.error statistic p.value `bootstrap CI …` `bootstrap CI …`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Inter… 5.47 1.00 5.47 1.14e-6 3.33 7.60
#> 2 istip 0.818 0.309 2.65 1.05e-2 -0.102 1.86
#> 3 core 0.00970 0.0222 0.436 6.64e-1 -0.0509 0.0630
#> 4 depth -0.0236 0.0118 -1.99 5.11e-2 -0.0503 0.00460
#> 5 istip… -0.0123 0.0264 -0.465 6.44e-1 -0.156 0.101
#> 6 p 1.87 NA NA NA 1.03 2.00
#> 7 phi 1.39 NA NA NA 0.883 4.18
fits$pp2$summary
#> # A tibble: 7 × 7
#> term estimate std.error statistic p.value `bootstrap CI …` `bootstrap CI …`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Inter… 2.38 0.675 3.53 4.65e-4 1.04 3.64
#> 2 istip 0.321 0.131 2.45 1.46e-2 0.0371 0.654
#> 3 core 0.0592 0.0100 5.92 7.34e-9 0.0372 0.0791
#> 4 depth 0.00627 0.00440 1.42 1.55e-1 -0.00188 0.0152
#> 5 istip… -0.0161 0.00827 -1.95 5.20e-2 -0.0511 0.0108
#> 6 p 1.71 NA NA NA 1.66 1.76
#> 7 phi 3.28 NA NA NA 2.84 3.99
fits$pp6$summary
#> # A tibble: 7 × 7
#> term estimate std.error statistic p.value `bootstrap CI …` `bootstrap CI …`
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Inte… 4.08 0.401 10.2 1.25e-19 3.34 4.88
#> 2 istip 0.501 0.224 2.24 2.66e- 2 0.0117 0.982
#> 3 core 0.0688 0.0200 3.44 7.17e- 4 0.0201 0.110
#> 4 depth -0.0314 0.0131 -2.39 1.78e- 2 -0.0583 -0.00411
#> 5 isti… 0.0266 0.0278 0.957 3.40e- 1 -0.0498 0.0982
#> 6 p 1.66 NA NA NA 1.60 1.74
#> 7 phi 4.33 NA NA NA 3.68 5.58
saveRDS(fits, "./data/efaecalis_fits.RDS")
We first plot a standard accumulation curve. This does not account for differences in error rates and the underlying population structure.
plot_acc(map(clades, ~.x$pa)) + theme_clean(base_size = 20) + theme(plot.background = element_blank(),
legend.background = element_blank(), axis.title.x = element_blank(), panel.spacing = unit(2,
"lines"))
ggsave("./figures/efaecalis_pangenome_accumulation_curves.png", width = 12, height = 7)
ggsave("./figures/efaecalis_pangenome_accumulation_curves.pdf", width = 12, height = 7)
plot_pangenome_fits(fits, include_data = TRUE, facet = TRUE) + theme_clean(base_size = 20) +
theme(plot.background = element_blank(), legend.background = element_blank(),
axis.title.x = element_blank(), panel.spacing = unit(2, "lines"))
ggsave("./figures/efaecalis_pangenome_fits.png", width = 12, height = 7)
ggsave("./figures/efaecalis_pangenome_fits.pdf", width = 12, height = 7)
We can confirm this difference by performing pairwise comparisons between the slopes of fitted models. This can be done both for the association with the core branch length and with the tips of the phylogeny. Significant associations with the tips are likely to be driven by both highly mobile elements that do not persist long enough to be observed more than once as well as the effects of annotation errors.
comb <- combinat::combn2(1:length(clades))
comparisons <- map2_dfr(comb[, 1], comb[, 2], ~{
fA <- fits[[.x]]
fB <- fits[[.y]]
return(compare_pangenomes(fA, fB, ci_type = "norm")$summary %>% add_column(clade1 = names(clades)[[.x]]) %>%
add_column(clade2 = names(clades)[[.y]]))
})
Look at comparisons of the rate of gene/gain loss at the core genome level
pdf <- comparisons %>% filter(term == "core") %>% arrange(p.value)
pdf$p.adj <- p.adjust(pdf$p.value, method = "BH")
knitr::kable(pdf[, c("clade1", "clade2", "statistic", "p.value", "p.adj")])
| clade1 | clade2 | statistic | p.value | p.adj |
|---|---|---|---|---|
| pp2 | pp6 | 3.2774014 | 0.0011132 | 0.0033395 |
| pp18 | pp6 | 2.5619733 | 0.0110173 | 0.0165260 |
| pp18 | pp2 | 0.0532086 | 0.9575907 | 0.9575907 |
Look at comparisons of error rates
pdf <- comparisons %>% filter(term == "istip") %>% arrange(p.value)
pdf$p.adj <- p.adjust(pdf$p.value, method = "BH")
knitr::kable(pdf[, c("clade1", "clade2", "statistic", "p.value", "p.adj")])
| clade1 | clade2 | statistic | p.value | p.adj |
|---|---|---|---|---|
| pp2 | pp6 | 0.8269171 | 0.4086394 | 0.757454 |
| pp18 | pp6 | 0.4667151 | 0.6411253 | 0.757454 |
| pp18 | pp2 | -0.3090250 | 0.7574540 | 0.757454 |
Look at comparisons of dispersion
pdf <- comparisons %>% filter(term == "dispersion model") %>% arrange(p.value)
pdf$p.adj <- p.adjust(pdf$p.value, method = "BH")
knitr::kable(pdf[, c("clade1", "clade2", "statistic", "p.value", "p.adj")])
| clade1 | clade2 | statistic | p.value | p.adj |
|---|---|---|---|---|
| pp18 | pp6 | 1.3222926 | 0.2501810 | 0.5251946 |
| pp18 | pp2 | 0.8729870 | 0.3501297 | 0.5251946 |
| pp2 | pp6 | 0.0736902 | 0.7860377 | 0.7860377 |