library(panstripe)
library(tidyverse)
library(data.table)
library(ggthemes)
library(ape)
Simulate pangenome evolution with sampling error rate.
set.seed(12345)
rates <- c(0, 1, 2, 5, 10)
names(rates) <- as.character(rates)
nreps <- 5
sim_error <- map(1:nreps, function(rep) map(rates, ~simulate_pan(rate = 0.001, mean_trans_size = 3,
fn_error_rate = .x, fp_error_rate = .x, ngenomes = 100)))
names(sim_error) <- 1:nreps
Create the necessary input files and run each algorithm
set up input data
imap(sim_error, function(rep, i) {
imap(rep, ~{
ape::write.tree(.x$tree, file = paste(c("./data/error_rates/panicmage/error_rate_",
.y, "_rep_", i, ".tree"), collapse = ""))
gc <- table(factor(colSums(.x$pa), levels = 1:nrow(.x$pa)))
writeLines(paste(gc, collapse = " "), paste(c("./data/error_rates/panicmage/error_rate_",
.y, "_rep_", i, ".txt"), collapse = ""))
})
})
run panicmage
for f in ./data/error_rates/panicmage/*.tree
do
prefix="${f%.*}"
echo $prefix
~/Documents/panicmage/panicmage ${prefix}.tree ${prefix}.txt 100 -n > ${prefix}_results.txt
done
load results
resfiles <- Sys.glob("./data/error_rates/panicmage/error_rate_*_results.txt")
panicmage_results <- map_dfr(resfiles, ~{
l <- read_lines(.x)
i <- which(grepl("Some characteristics.*", l))
params <- as.numeric(gsub(".*= ", "", unlist(str_split(l[[i + 1]], " \t "))))
tibble(error_rate = gsub("_rep.*", "", gsub(".*rate_", "", .x)), rep = gsub("_results.*",
"", gsub(".*rep_", "", .x)), theta = params[[1]], tho = params[[2]], core = params[[3]])
})
pdf <- panicmage_results %>% pivot_longer(cols = colnames(panicmage_results)[3:5])
pdf$error_rate <- factor(pdf$error_rate, levels = sort(unique(as.numeric(pdf$error_rate))))
ggplot(pdf, aes(x = error_rate, y = value)) + geom_point() + facet_wrap(~name, scales = "free_y")
We first need to convert the newick files into ‘tree table’ format.
for f in ./data/error_rates/panicmage/*.tree
do
prefix=$(basename $f)
prefix="${prefix%.*}"
echo $prefix
perl ~/Documents/pangenome/tre2table.pl $f > ./data/error_rates/collins/${prefix}.txt
done
We can now run the scripts from Collins et al., 2012.
source("./scripts/f-pangenome.R")
# coalescent
mymaxit <- 10000
myreltol <- 1e-06
mymodel <- "coalescent.spec" #use the coalescent tree w/G(k)
myfitting <- "chi2" #fit it using this error function
constr <- 1 # G0 constrained to the mean genome size during fitting
mymethod <- "Nelder"
collins_results <- imap_dfr(sim_error, function(rep, i) {
imap_dfr(rep, ~{
mat <- t(.x$pa)
ng <- nrow(.x$pa)
G0 <- mean(colSums(mat > 0)) # mean genome size measured in gene families
treetable <- read.table(paste(c("./data/error_rates/collins/error_rate_",
.y, "_rep_", i, ".txt"), collapse = ""), sep = "\t", row.names = 1)
colnames(treetable) <- c("desc_a", "desc_b", "dist")
# calculate gene family frequency spectrum
Gk <- f.getspectrum(mat)
# set initial parameters and recursively optimize
opt.spec.cde <- f.recurse(c(1, 100), r.data = Gk, r.genomesize = G0, r.ng = ng)
# get optimized parameters, calculate constrained parameter
params.spec.cde <- c(opt.spec.cde$par[1], opt.spec.cde$par[1] * (G0 - opt.spec.cde$par[2]),
opt.spec.cde$par[2])
# set initial parameters and recursively optimize
pinitial.spec.c2de <- c(params.spec.cde[1]/10, params.spec.cde[2]/100, params.spec.cde[3],
params.spec.cde[1] * 10000)
opt.spec.c2de <- f.recurse(pinitial.spec.c2de, r.data = Gk, r.genomesize = G0,
r.ng = ng)
# get optimized parameters, calculate constrained parameter
params.spec.c2de <- c(opt.spec.c2de$par, opt.spec.c2de$par[4] * (G0 - opt.spec.c2de$par[2]/opt.spec.c2de$par[1] -
opt.spec.c2de$par[3]))
print(params.spec.c2de)
spec.c2de <- f.coalescent.spec(params.spec.c2de, ng)
# summarize paramters
return(tibble(error_rate = .y, rep = i, Gess = params.spec.c2de[3], theta1 = params.spec.c2de[2],
rho1 = params.spec.c2de[1], theta2 = params.spec.c2de[5], rho2 = params.spec.c2de[4],
fslow = params.spec.c2de[2]/params.spec.c2de[1]/G0, ffast = params.spec.c2de[5]/params.spec.c2de[4]/G0,
fess = params.spec.c2de[3]/G0, Gnew100 = f.coalescent(params.spec.c2de,
100)$pan[100] - f.coalescent(params.spec.c2de, 100)$pan[99], Gnew1000 = f.coalescent(params.spec.c2de,
1000)$pan[1000] - f.coalescent(params.spec.c2de, 1000)$pan[999],
Gcore100 = f.coalescent(params.spec.c2de, 100)$core[100], Gcore1000 = f.coalescent(params.spec.c2de,
1000)$core[1000], Gpan100 = f.coalescent(params.spec.c2de, 100)$pan[100],
Gpan1000 = f.coalescent(params.spec.c2de, 1000)$pan[1000]))
})
})
pdf <- collins_results[, 1:10] %>% pivot_longer(cols = colnames(collins_results)[3:10])
pdf$error_rate <- factor(pdf$error_rate, levels = sort(unique(as.numeric(pdf$error_rate))))
Plot parameter estimates
ggplot(pdf, aes(x = error_rate, y = value)) + geom_point() + facet_wrap(~name, scales = "free_y")
CAFE was originally designed for analysing the rates of gain and loss in gene families and is not currently optimised for investigating the size of the bacterial pangenome. However, it is the only model considered that accounts for annotation errors. In this analysis we assume that all genes belong to a single ‘family’.
We provide the error file using the same distribution as was used in the original simulation.
# imap(sim_error, function (rep, i) { imap(rep, ~{ ape::write.tree(.x$tree, file
# = paste(c('./data/error_rates/CAFE/error_rate_', .y,'_rep_',i, '.tree'),
# collapse = '')) gc <- rowSums(.x$pa) writeLines(c(paste(c('Desc', 'Family ID',
# names(gc)), collapse = '\t'), paste(c('NA', 'allD', gc), collapse = '\t')),
# paste(c('./data/error_rates/CAFE/error_rate_', .y,'_rep_',i,
# '_gene_families.txt'), collapse = '')) }) })
Generate error files.
# max_genes <- ceiling(1.2*max(map_dbl(sim_error, ~ max(map_dbl(.x, function(r)
# ncol(r$pa)))))) map(rates, ~{ p <- dexp(x=1, rate = 1/.x) error_tbl <- tibble(
# 'cntdiff'=0:max_genes, '-1'=c(0, rep(p, max_genes)), '0'= c(1-p, rep(1-2*p,
# max_genes)), '1'= rep(p, max_genes+1) ) file <-
# paste(c('./data/error_rates/CAFE/error_model_rate_', .x,'.txt'), collapse = '')
# writeLines(paste0('maxcnt:', max_genes), file) write.table(error_tbl, file =
# file, append = TRUE, quote = FALSE, sep = ' ', row.names = FALSE) })
Run CAFE
~/Documents/CAFE5-5.0/bin/cafe5 -i ./data/error_rates/CAFE/error_rate_10_rep_1_gene_families.txt -t ./data/error_rates/CAFE/error_rate_10_rep_1.tree -e./data/error_rates/CAFE/error_model_rate_10.txt
Here we make use of the implementation of the FMG model as described in Zamani-Dahaj et al., 2016 and implemented in the Panaroo package.
imap(sim_error, function(rep, i) {
imap(rep, ~{
ape::write.tree(.x$tree, file = paste(c("./data/error_rates/zamani-dahaj/error_rate_",
.y, "_rep_", i, ".tree"), collapse = ""))
tb <- as_tibble(t(.x$pa)) %>% add_column(gene = colnames(.x$pa), .before = 1)
write.table(tb, sep = "\t", quote = FALSE, row.names = FALSE, file = paste(c("./data/error_rates/zamani-dahaj/error_rate_",
.y, "_rep_", i, "_pa.txt"), collapse = ""))
})
})
for f in ./data/error_rates/zamani-dahaj/*.tree
do
prefix=$(basename $f)
prefix="${prefix%.*}"
echo $prefix
python ~/Documents/panaroo/panaroo-estimate-fmg.py -o ./data/error_rates/zamani-dahaj/${prefix}.txt --pa ./data/error_rates/zamani-dahaj/${prefix}_pa.txt --tree $f
done
Load results
resfiles <- Sys.glob("./data/error_rates/zamani-dahaj/error_rate_*[0-9].txt")
zamani_results <- map_dfr(resfiles, ~{
df <- fread(.x, skip = 4, header = FALSE, col.names = c("parameter", "estimate",
"NA1", "NA2"))[, 1:2] %>% as_tibble() %>% add_column(error_rate = gsub("_rep.*",
"", gsub(".*rate_", "", .x)), .before = 1) %>% add_column(rep = gsub("\\.txt",
"", gsub(".*rep_", "", .x)), .before = 1)
return(df)
})
zamani_results$error_rate <- factor(zamani_results$error_rate, levels = sort(unique(as.numeric(zamani_results$error_rate))))
pdf <- zamani_results %>% group_by(error_rate, rep) %>% summarise(parameter = c("u",
"v"), estimate = c(estimate[which(parameter == "a")] * estimate[which(parameter ==
"M")], estimate[which(parameter == "v")]))
ggplot(pdf, aes(x = error_rate, y = estimate)) + geom_point() + facet_wrap(~parameter)
Accumulation curves do not use a statistical model by are very commonly used. Uncertainty is typically quantified using permutations. Here, we consider two curves to be different if there is no overlap in their curves generated by permutation.
nperm <- 100
curves <- imap_dfr(sim_error, function(rep, i) {
print(i)
return(imap_dfr(rep, ~{
.x <- t(.x$pa)
plotdf <- purrr::map_dfr(1:nperm, function(i) {
ppa <- .x[sample(nrow(.x), replace = FALSE), sample(ncol(.x), replace = FALSE)]
cumlative <- rowSums(apply(ppa, 1, cumsum) > 0)
cumlative <- cumlative - cumlative[[1]]
df <- tibble::tibble(N = 1:length(cumlative), naccessory = cumlative,
permutation = i)
return(df)
}) %>% tibble::add_column(pangenome = .y)
plotdf <- plotdf %>% dplyr::group_by(N, pangenome) %>% dplyr::summarise(`accessory size` = mean(naccessory),
std = sd(naccessory)) %>% add_column(rep = i, .before = 1) %>% add_column(error_rate = .y,
.before = 1)
return(plotdf)
}))
})
#> [1] "1"
#> [1] "2"
#> [1] "3"
#> [1] "4"
#> [1] "5"
curves$group <- paste(curves$error_rate, curves$rep, sep = "_")
curves$error_rate <- factor(curves$error_rate, levels = sort(unique(as.numeric(curves$error_rate))))
ggplot(curves, aes(N, `accessory size`, col = error_rate, fill = error_rate, group = group)) +
geom_ribbon(aes(ymin = `accessory size` - std, ymax = `accessory size` + std),
alpha = 0.5, col = NA) + scale_color_brewer(type = "qual", palette = 5) +
scale_fill_brewer(type = "qual", palette = 5) + geom_line(size = 1) + theme_bw(base_size = 14) +
xlab("Number of genomes") + ylab("Accessory size") + labs(fill = "error rate",
color = "error rate")
heap_df <- function(pa) {
cm <- do.call(rbind, map(1:10, ~{
ppa <- pa
ppa <- ppa[sample(nrow(ppa), replace = FALSE), sample(ncol(ppa), replace = FALSE)]
cumlative <- rowSums(apply(t(ppa), 1, cumsum) > 0)
cumlative <- cumlative - cumlative[[1]]
return(cumlative)
}))
cumulative_median <- apply(cm, 2, median)
res <- broom::tidy(lm(nunique ~ logN, tibble(logN = log(1:length(cumulative_median)),
nunique = log(cumulative_median + 0.001))))
return(res)
}
heap_results <- imap_dfr(sim_error, function(rep, i) {
return(imap_dfr(rep, ~{
pdf <- heap_df(.x$pa) %>% add_column(rep = i, .before = 1) %>% add_column(error_rate = as.numeric(.y),
.before = 1)
}))
return(pdf)
})
heap_results$term <- ifelse(heap_results$term == "logN", "alpha", "log(K)")
heap_results <- heap_results %>% filter(term == "alpha")
heap_results$error_rate <- factor(heap_results$error_rate, levels = sort(unique(heap_results$error_rate)))
ggplot(heap_results, aes(x = error_rate, y = estimate, col = rep)) + geom_point()
pp_results <- imap_dfr(sim_error, function(rep, i) {
return(imap_dfr(rep, ~{
fit <- panstripe(.x$pa, .x$tree, nboot = 0, quiet = TRUE, family = "gaussian")
pdf <- fit$summary %>% add_column(rep = i, .before = 1) %>% add_column(error_rate = .y,
.before = 1)
return(pdf)
}))
})
pp_results$error_rate <- factor(pp_results$error_rate, levels = sort(unique(as.numeric(pp_results$error_rate))))
pp_results <- pp_results %>% filter(term == "core")
ggplot(pp_results, aes(x = error_rate, y = estimate, col = rep)) + geom_point()
####Summary plot
# panicmage
pannic_res <- panicmage_results %>% pivot_longer(cols = colnames(panicmage_results)[3:5]) %>%
filter(name == "theta")
pannic_res$value <- c(scale(pannic_res$value, scale = FALSE, center = FALSE))
pannic_test <- do.call(rbind, pannic_res %>% filter(error_rate > 0) %>% group_by(error_rate) %>%
group_map(~{
broom::tidy(t.test(pannic_res$value[pannic_res$error_rate == 0], .x$value,
alternative = "two.sided")) %>% add_column(error_rate = unique(.x$error_rate),
.before = 1)
}, .keep = TRUE))
pannic_test
#> # A tibble: 4 × 11
#> error_rate estimate estimate1 estimate2 statistic p.value parameter conf.low
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -10.2 131. 142. -1.15 2.94e-1 5.85 -31.9
#> 2 10 -292. 131. 424. -25.4 1.66e-6 5.03 -322.
#> 3 2 -45.9 131. 177. -8.42 3.02e-5 7.99 -58.5
#> 4 5 -145. 131. 276. -7.74 1.04e-3 4.37 -195.
#> # … with 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
# collins
collins_res <- collins_results[, 1:10] %>% pivot_longer(cols = colnames(collins_results)[3:10]) %>%
filter(name == "theta1")
collins_res$value <- c(scale(collins_res$value, scale = FALSE, center = FALSE))
collins_test <- do.call(rbind, collins_res %>% filter(error_rate > 0) %>% group_by(error_rate) %>%
group_map(~{
broom::tidy(t.test(collins_res$value[collins_res$error_rate == 0], .x$value,
alternative = "two.sided")) %>% add_column(error_rate = unique(.x$error_rate),
.before = 1)
}, .keep = TRUE))
collins_test
#> # A tibble: 4 × 11
#> error_rate estimate estimate1 estimate2 statistic p.value parameter conf.low
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -5.40e-4 0.000107 6.46e-4 -0.987 0.378 4.07 -2.05e-3
#> 2 10 -3.09e+2 0.000107 3.09e+2 -8.46 0.00107 4.00 -4.11e+2
#> 3 2 -1.41e+2 0.000107 1.41e+2 -3.99 0.0162 4.00 -2.39e+2
#> 4 5 -1.80e+2 0.000107 1.80e+2 -8.41 0.00109 4.00 -2.40e+2
#> # … with 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
# Zamani-Dahaj
zamani_res <- zamani_results %>% filter(parameter == "v")
zamani_res$estimate <- c(scale(zamani_res$estimate, scale = FALSE, center = FALSE))
zamani_res$error_rate <- as.numeric(as.character(zamani_res$error_rate))
zamani_test <- do.call(rbind, zamani_res %>% filter(error_rate > 0) %>% group_by(error_rate) %>%
group_map(~{
broom::tidy(t.test(zamani_res$estimate[zamani_res$error_rate == 0], .x$estimate,
alternative = "two.sided")) %>% add_column(error_rate = unique(.x$error_rate),
.before = 1)
}, .keep = TRUE))
zamani_test
#> # A tibble: 4 × 11
#> error_rate estimate estimate1 estimate2 statistic p.value parameter conf.low
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -0.000316 0.00203 0.00234 -2.77 2.47e-2 7.80 -5.80e-4
#> 2 2 -0.000690 0.00203 0.00272 -5.18 9.13e-4 7.81 -9.99e-4
#> 3 5 -0.00166 0.00203 0.00369 -11.5 5.47e-6 7.42 -2.00e-3
#> 4 10 -0.00357 0.00203 0.00560 -23.1 6.95e-8 7.03 -3.93e-3
#> # … with 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
# accumulation curves
curves_res <- curves %>% filter(N > 1)
curves_res <- map_dfr(split(curves_res, curves_res$N), ~{
.x$value = c(scale(.x$`accessory size`, scale = FALSE, center = FALSE))
return(.x)
})
curves_res <- curves_res %>% pivot_wider(id_cols = c("rep", "N"), names_from = "error_rate",
values_from = "value")
curve_test <- map_dfr(4:7, ~{
broom::tidy(ks.test(curves_res$`0`, unlist(curves_res[, .x]), alternative = "two.sided")) %>%
add_column(error_rate = colnames(curves_res)[[.x]])
})
curve_test
#> # A tibble: 4 × 5
#> statistic p.value method alternative error_rate
#> <dbl> <dbl> <chr> <chr> <chr>
#> 1 0.251 6.46e-14 Two-sample Kolmogorov-Smirnov test two-sided 1
#> 2 0.398 0 Two-sample Kolmogorov-Smirnov test two-sided 2
#> 3 0.578 0 Two-sample Kolmogorov-Smirnov test two-sided 5
#> 4 0.711 0 Two-sample Kolmogorov-Smirnov test two-sided 10
# Heaps
heap_results$value <- scale(heap_results$estimate, scale = FALSE, center = FALSE)
heap_results$error_rate <- as.numeric(as.character(heap_results$error_rate))
heap_test <- do.call(rbind, heap_results %>% filter(error_rate > 0) %>% group_by(error_rate) %>%
group_map(~{
broom::tidy(t.test(heap_results$value[heap_results$error_rate == 0], .x$value,
alternative = "two.sided")) %>% add_column(error_rate = unique(.x$error_rate),
.before = 1)
}, .keep = TRUE))
# panstripe
pp_results$value <- scale(pp_results$estimate, scale = FALSE, center = FALSE)
pp_results$error_rate <- as.numeric(as.character(pp_results$error_rate))
panstripe_test <- do.call(rbind, pp_results %>% filter(error_rate > 0) %>% group_by(error_rate) %>%
group_map(~{
broom::tidy(t.test(pp_results$value[pp_results$error_rate == 0], .x$value,
alternative = "two.sided")) %>% add_column(error_rate = unique(.x$error_rate),
.before = 1)
}, .keep = TRUE))
panstripe_test
#> # A tibble: 4 × 11
#> error_rate estimate estimate1 estimate2 statistic p.value parameter conf.low
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.209 12.2 12.0 0.161 0.876 7.82 -2.80
#> 2 2 0.588 12.2 11.6 0.514 0.621 7.92 -2.05
#> 3 5 0.0664 12.2 12.1 0.0482 0.963 7.55 -3.15
#> 4 10 0.493 12.2 11.7 0.435 0.675 7.89 -2.13
#> # … with 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
tests <- tibble(method = rep(c("panicmage", "Zamani-Dahaj", "Collins", "accumulation\ncurve",
"Heaps", "panstripe"), c(nrow(pannic_test), nrow(zamani_test), nrow(collins_test),
nrow(curve_test), nrow(heap_test), nrow(panstripe_test))), t.statistic = c(pannic_test$statistic,
zamani_test$statistic, collins_test$statistic, curve_test$statistic, heap_test$statistic,
panstripe_test$statistic), p.value = c(pannic_test$p.value, zamani_test$p.value,
collins_test$p.value, curve_test$p.value, heap_test$p.value, panstripe_test$p.value),
error.rate = c(pannic_test$error_rate, zamani_test$error_rate, collins_test$error_rate,
curve_test$error_rate, heap_test$error_rate, panstripe_test$error_rate))
curves_res_subset <- curves_res %>% filter(N == 20)
pdf <- tibble(values = c(pp_results$value, pannic_res$value, zamani_res$estimate,
collins_res$value, heap_results$value, curves_res_subset$`0`, curves_res_subset$`1`,
curves_res_subset$`2`, curves_res_subset$`5`, curves_res_subset$`10`), error.rate = c(pp_results$error_rate,
pannic_res$error_rate, zamani_res$error_rate, collins_res$error_rate, heap_results$error_rate,
rep(c(0, 1, 2, 5, 10), each = nrow(curves_res_subset))), method = rep(c("panstripe",
"panicmage", "Zamani-Dahaj", "Collins", "Heaps", "accumulation\ncurve"), c(nrow(pp_results),
nrow(pannic_res), nrow(zamani_res), nrow(collins_res), nrow(heap_results), 5 *
nrow(curves_res_subset))))
pdf$error.rate <- factor(pdf$error.rate, levels = sort(unique(as.numeric(pdf$error.rate))))
pdf$method <- factor(pdf$method, levels = c("panstripe", "Zamani-Dahaj", "Collins",
"panicmage", "Heaps", "accumulation\ncurve"))
pdf$is.sig <- ifelse(paste(pdf$method, pdf$error.rate) %in% paste(tests$method, tests$error.rate)[tests$p.value <
0.05], "significant", "not significant")
ggplot(pdf, aes(x = error.rate, y = values, col = is.sig)) + geom_boxplot(outlier.colour = NA,
position = position_dodge(width = 0.8)) + geom_point(size = 2, position = position_dodge(width = 0.8)) +
theme_clean(base_size = 20) + facet_wrap(~method, scales = "free", nrow = 1) +
scale_color_manual(values = c("#4d4d4d", "#b2182b")) + theme(plot.background = element_blank(),
legend.background = element_blank(), axis.title.x = element_blank(), panel.spacing = unit(2,
"lines")) + # scale_alpha_discrete(guide = 'none') +
ylab("estimated parameter value") + labs(color = "")
ggsave("./figures/simulation_error_rate_summary.png", width = 17, height = 7)
ggsave("./figures/simulation_error_rate_summary.pdf", width = 17, height = 7)