We imputed chromosome 21 using GLIMPSE for sequence data and IMPUTE5 for array data. We recompute concordance on chromosome 21 from the original data and the new results.
# load in requisite packages
library(tidyverse)
library(ggthemes)
library(patchwork)
library(rms)
library(glue)
library(knitr)
library(kableExtra)
# per-sample population annotations, read from a space-delimited list;
# joined onto the concordance table by `cell_line` further down
pops <- read_delim(
  file = "../data_revisions/all_samples_with_pops.list",
  delim = " "
)
# Load the precomputed genotype-concordance counts (space-delimited, column
# names supplied explicitly), tag each row with its experiment, attach the
# population annotations and derive non-reference concordance (NRC).
concordance <- read_delim(
  "../data_revisions/all-gc-counts.txt",
  delim = " ",
  col_names = c(
    "cell_line", "sample", "stat_type", "set_id", "af_midpoint",
    "rr_hom_matches", "ra_het_matches", "aa_hom_matches",
    "rr_hom_mismatches", "ra_het_mismatches", "aa_hom_mismatches",
    "dosage_rsquared", "n_genotypes", "site_type", "tech",
    "imputation_program"
  )
) %>%
  mutate(
    # characterize into ilmn 0.5x, 1x, BGI, array based on the sample-name
    # suffix; clauses are tried top to bottom, and a sample matching none of
    # them is left as NA
    Experiment = case_when(
      grepl(pattern = "-0-2$", x = sample) ~ "Experiment D",
      grepl(pattern = "-0-[0-9]-[01]-0$", x = sample) ~ "Experiment A",
      grepl(pattern = "-1-[0-9]-[01]-0$", x = sample) ~ "Experiment B",
      grepl(pattern = "_[1-3]$", x = sample) ~ "Experiment E (array)",
      grepl(pattern = "-1$", x = sample) ~ "ilmn_1x_repl"
    )
  ) %>%
  # keep only rows whose cell_line also appears in `pops`
  inner_join(pops, by = "cell_line") %>%
  mutate(
    # matches at het and hom-alt genotypes
    nrc_numerator = ra_het_matches + aa_hom_matches,
    # those matches plus every mismatch; hom-ref matches are not counted
    nrc_denominator = nrc_numerator +
      rr_hom_mismatches + ra_het_mismatches + aa_hom_mismatches,
    nrc = nrc_numerator / nrc_denominator
  )
We can summarize results and plot, like so:
# mean NRC within each combination of allele-frequency bin, experiment,
# statistic type, site type, technology, imputation program and super
# population
concordance_summary <- concordance %>%
  group_by(
    af_midpoint,
    Experiment,
    stat_type,
    site_type,
    tech,
    imputation_program,
    super_pop
  ) %>%
  summarise(mean_nrc = mean(nrc)) %>%
  # human-readable column name, used as the facet label in the plots
  rename(`Super population` = super_pop)
## `summarise()` regrouping output by 'af_midpoint', 'Experiment', 'stat_type', 'site_type', 'tech', 'imputation_program' (override with `.groups` argument)
# mean NRC against non-reference allele frequency (log10 x axis) for all
# sites: one panel per super population, colour = experiment,
# line type = imputation program
concordance_summary %>%
  filter(stat_type == "GCsAF" & site_type == "allsites") %>%
  ggplot(mapping = aes(x = af_midpoint, y = mean_nrc, color = Experiment)) +
  geom_point() +
  geom_line(mapping = aes(linetype = factor(imputation_program))) +
  facet_grid(cols = vars(`Super population`)) +
  scale_x_log10() +
  scale_color_tableau() +
  labs(
    x = "Non-Reference Allele Frequency",
    y = "Non-Reference Concordance",
    title = "Average NRC by non-reference allele frequency, unfiltered on chr21"
  ) +
  theme_few()
# no plot is passed, so ggsave() writes the most recently created plot
ggsave(
  filename = "../paper/src/figs/revisions-chr21-nrc.pdf",
  width = 10,
  height = 7
)
from which we see that the qualitative trend of sequence imputation being more accurate than array imputation holds, with a fixed effect of the imputation program used for a given assay type/Experiment.
Interestingly, if one looks at filtered SNPs, the gap between concordance for the two array imputation algorithms (IMPUTE5 vs minimac4) widens significantly:
# same figure as for all sites, but restricted to rows whose site_type is
# "passing": one panel per super population, colour = experiment,
# line type = imputation program, log10 x axis
concordance_summary %>%
  filter(stat_type == "GCsAF" & site_type == "passing") %>%
  ggplot(mapping = aes(x = af_midpoint, y = mean_nrc, color = Experiment)) +
  geom_point() +
  geom_line(mapping = aes(linetype = factor(imputation_program))) +
  facet_grid(cols = vars(`Super population`)) +
  scale_x_log10() +
  scale_color_tableau() +
  labs(
    x = "Non-Reference Allele Frequency",
    y = "Non-Reference Concordance",
    title = "Average NRC by non-reference allele frequency, filtered for high confidence on chr21"
  ) +
  theme_few()
particularly at higher allele frequencies.