library(tidyverse)
library(ggsci)
library(repeatR)
library(GenomicRanges)
library(geomtextpath)
library(scales)
library(cowplot)

theme_set(theme_bw(base_size = 14))

## Convert RepeatMasker output to a BED-like table ----
## Every file under data/repmask/ is parsed with read_rm(), the hits are
## stacked, and query name, start, end, repeat class and repeat name are
## written as a headerless TSV. `repgr` holds the table that was written.
## NOTE(review): coordinates are written exactly as parsed. If the starts are
## 1-based (RepeatMasker convention) they are not shifted to BED's 0-based
## starts here -- confirm this is what the downstream join expects.
repgr <- lapply(list.files("data/repmask/", full.names = TRUE), read_rm) %>%
  bind_rows() %>%
  # Round-trip through GRanges so the coordinate columns come back with the
  # standard names (seqnames/start/end) while the annotation columns are kept.
  makeGRangesFromDataFrame(
    seqnames.field = "qname",
    start.field = "qstart",
    end.field = "qend",
    keep.extra.columns = TRUE
  ) %>%
  as_tibble() %>%
  select(seqnames, start, end, tclass, tname) %>%
  write_tsv("data/repeatmask.bed", col_names = FALSE)

#' Load a CpG-by-repeat join table and keep strand-consistent records
#'
#' Reads a headerless TSV with ten columns (node, pos, end, strand, freq,
#' node2, start2, end2, tclass, tname), labels records without a repeat class
#' as "None", and keeps only rows whose `strand` equals the `cstrand` recorded
#' for the same node/position in `cpgs`.
#'
#' @param p Path to the TSV file (callers in this script pass `.bed.gz` files).
#' @param cpgs Data frame with columns `node`, `pos` and `cstrand`.
#' @return A tibble with the ten input columns. Rows without a matching
#'   `cpgs` entry, or whose strand differs from `cstrand`, are dropped.
load_mc_mask <- function(p, cpgs) {
  column_names <- c(
    "node", "pos", "end", "strand", "freq",
    "node2", "start2", "end2", "tclass", "tname"
  )

  read_tsv(
    p,
    col_names = column_names,
    # Tokens read as missing. Spelled as strings because `na` is a character
    # vector; the bare -1 used previously was coerced to "-1" by c() anyway.
    na = c("-1", ".", "", "NA")
  ) %>%
    replace_na(replace = list(tclass = "None")) %>%
    # Explicit keys instead of "whatever column names happen to be shared".
    left_join(cpgs, by = c("node", "pos")) %>%
    # Remove signal that is on the wrong strand. Rows with no match in `cpgs`
    # have an NA cstrand and are dropped by this comparison as well.
    filter(cstrand == strand) %>%
    select(-cstrand)
}

# Per-CpG strand index: space-delimited file whose first line is skipped
# (presumably a header -- confirm) and replaced by the column names that
# load_mc_mask() joins on.
cpgs <- read_delim(
  "gfa/cpg_index_xaf_strand.csv.gz",
  col_names = c("node", "pos", "cstrand"),
  skip = 1,
  delim = " "
)

# 5mCpG records joined with repeat annotation, for the reference and SV sets.
# The two loads are independent of each other.
rep_ref_mc <- load_mc_mask("data/ref_mc_pangenome_n5_repmask_join.bed.gz", cpgs)
rep_sv_mc <- load_mc_mask("data/sv_mc_pangenome_n5_repmask_join.bed.gz", cpgs)

# Reference CpGs: number of 5mCpG records per repeat class
repmc_ref_count <- count(rep_ref_mc, tclass)

# Horizontal bars for the classes with more than 10,000 records, ordered by
# count. The plotted value is the record count divided by two.
repmc_ref_p <- ggplot(filter(repmc_ref_count, n > 10000)) +
  geom_col(aes(x = n / 2, y = reorder(tclass, n))) +
  labs(y = "Repeat", x = "Number of 5mCpGs")
ggsave(filename = "plots/cpg_ref_repeats.pdf", plot = repmc_ref_p)

# Histogram of the `freq` column over all reference 5mCpG records, using
# geom_histogram()'s default binning.
rep_ref_mc_freq_p <- ggplot(rep_ref_mc) +
  geom_histogram(mapping = aes(x = freq)) +
  # Log-scaled counts, kept for reference but disabled:
  ## scale_y_log10(breaks = c(1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8)) +
  labs(title = "Frequency of reference 5mCpGs", x = "5mCpG frequency")
ggsave(filename = "plots/cpg_ref_freq_histogram.pdf", plot = rep_ref_mc_freq_p)

# Empirical CDF of 5mCpG frequency for the reference set, one panel per
# repeat class.
rep_ref_mc_freq_tclass_p <- ggplot(rep_ref_mc, aes(x = freq)) +
  stat_ecdf() +
  facet_wrap(vars(tclass)) +
  scale_x_continuous(breaks = c(0, 200, 400)) +
  labs(title = "Frequency of reference 5mCpGs", x = "5mCpG frequency")
ggsave(
  filename = "plots/cpg_ref_freq_ecdf_histogram.pdf",
  plot = rep_ref_mc_freq_tclass_p,
  width = 12,
  height = 12
)

# ECDF of 5mCpG frequency for the abundant repeat classes of the reference
# set (more than 200,000 records each), pooled into broad repeat families.
rep_ref_mc_freq_tclass_subset_p <- rep_ref_mc %>%
  group_by(tclass) %>%
  mutate(N = n()) %>%
  # The per-class size is all the grouping was needed for.
  ungroup() %>%
  filter(N > 200000) %>%
  # Collapse sub-classes into families; anything unmatched keeps its own name.
  mutate(Repeat = case_when(
    str_detect(tclass, "ERV") ~ "ERV",
    str_detect(tclass, "LINE") ~ "LINE",
    str_detect(tclass, "Satellite") ~ "Satellite",
    str_detect(tclass, "SINE") ~ "SINE",
    str_detect(tclass, "SVA") ~ "SVA",
    TRUE ~ tclass
  )) %>%
  ggplot() +
  stat_ecdf(aes(x = freq, color = Repeat, group = Repeat)) +
  scale_x_continuous(breaks = c(0, 200, 400)) +
  # The data plotted here is the reference set, so the title says so.
  labs(x = "5mCpG frequency", title = "Frequency of reference 5mCpGs") +
  scale_color_npg()
## ggsave("plots/cpg_freq_ecdf_subset_histogram.pdf", direct.label(rep_ref_mc_freq_tclass_subset_p, "lines2"))
ggsave("plots/cpg_ref_freq_ecdf_subset_histogram.pdf", rep_ref_mc_freq_tclass_subset_p)

# SV CpGs: number of 5mCpG records per repeat class
rep_sv_mc_count <- count(rep_sv_mc, tclass)

# Horizontal bars for the classes with more than 10,000 records, ordered by
# count; the plotted value is the record count divided by two.
rep_sv_mc_p <- ggplot(filter(rep_sv_mc_count, n > 10000)) +
  geom_col(aes(x = n / 2, y = reorder(tclass, n))) +
  scale_x_continuous(labels = label_scientific()) +
  labs(
    title = "Repeats in SV 5mCpGs",
    subtitle = "SV CpGs with methylation (5mCpGs)",
    x = "Number of 5mCpGs",
    y = "Repeat"
  )
ggsave(filename = "plots/cpg_sv_repeats.pdf", plot = rep_sv_mc_p)


# Empirical CDF of 5mCpG frequency for the SV set, one panel per repeat class.
rep_sv_mc_freq_tclass_p <- ggplot(rep_sv_mc, aes(x = freq)) +
  stat_ecdf() +
  facet_wrap(vars(tclass)) +
  scale_x_continuous(breaks = c(0, 200, 400)) +
  labs(title = "Frequency of SV 5mCpGs", x = "5mCpG frequency")
ggsave(
  filename = "plots/cpg_sv_freq_ecdf_histogram.pdf",
  plot = rep_sv_mc_freq_tclass_p,
  width = 12,
  height = 12
)

# Frequency spectra of SV 5mCpGs for the abundant repeat classes (more than
# 20,000 records each), pooled into broad families. Curves are labelled
# directly with geomtextpath's "textline" geom instead of a legend.
rep_sv_mc_freq_tclass_subset_p <- rep_sv_mc %>%
  group_by(tclass) %>%
  mutate(N = n()) %>%
  # The per-class size is all the grouping was needed for.
  ungroup() %>%
  filter(N > 20000) %>%
  # Collapse sub-classes into families; anything unmatched keeps its own name.
  # The family name is printed on the curve, so its spelling matters.
  mutate(Repeat = case_when(
    str_detect(tclass, "ERV") ~ "ERV",
    str_detect(tclass, "LINE") ~ "LINE",
    str_detect(tclass, "Satellite") ~ "Satellite",
    str_detect(tclass, "SINE") ~ "SINE",
    str_detect(tclass, "SVA") ~ "SVA",
    TRUE ~ tclass
  )) %>%
  ggplot() +
  stat_ecdf(
    aes(x = freq, color = Repeat, group = Repeat, label = Repeat, hjust = Repeat),
    geom = "textline"
  ) +
  scale_x_continuous(breaks = c(0, 200, 400)) +
  # Nine evenly spaced hjust values (0.3 to 0.7) so the labels of different
  # families sit at different positions along their curves.
  scale_hjust_manual(values = seq(0.3, 0.7, by = (0.7 - 0.3) / 8)) +
  labs(
    x = "Cohort 5mCpG count",
    title = "Frequency spectra of SV 5mCpGs",
    subtitle = "Stratified by repeat categories",
    y = "Cumulative distribution"
  ) +
  scale_color_npg() +
  theme(legend.position = "none")

ggsave("plots/cpg_sv_freq_ecdf_subset.pdf", rep_sv_mc_freq_tclass_subset_p, width = 7, height = 7)
ggsave("plots/cpg_sv_freq_ecdf_subset.png", rep_sv_mc_freq_tclass_subset_p, width = 7, height = 7)

# Per-class record counts of both methylomes stacked into one table. Percent
# is each class's share of the total within its own methylome.
mc_rep_count <- bind_rows(
  mutate(rep_sv_mc_count, Methylome = "SV", Percent = n / sum(n) * 100),
  mutate(repmc_ref_count, Methylome = "Reference", Percent = n / sum(n) * 100)
)

mc_rep_count %>% write_tsv("data/mc_repeats.tsv")

# Re-read the per-class share table from disk so this section can be run
# without recomputing the counts.
mc_rep_count <- read_tsv("data/mc_repeats.tsv")

# Dodged bars comparing, per repeat class, the share of 5mCpGs in the
# reference and SV methylomes.
repeats_enrichment_p <- mc_rep_count %>%
  # The 30 largest shares overall, ties kept. The ranking column is named
  # explicitly rather than relying on top_n()'s "last column" default.
  slice_max(Percent, n = 30) %>%
  group_by(tclass) %>%
  # Keep only classes that made the cut with more than one row, i.e. in both
  # methylomes.
  filter(n() > 1) %>%
  ungroup() %>%
  ggplot() +
  geom_col(
    aes(y = reorder(tclass, Percent), x = Percent, fill = Methylome),
    position = "dodge"
  ) +
  labs(
    x = "Percent of 5mCpGs", y = "Repeat category",
    title = "Proportion of 5mCpGs in repeats",
    subtitle = "Reference versus SV 5mCpGs"
  ) +
  theme(legend.position = "top") +
  scale_fill_npg()
# Pass the plot explicitly, like every other ggsave() call in this script,
# instead of depending on last_plot().
ggsave("plots/cpg_sv_ref_repeats.pdf", repeats_enrichment_p)

# Genotyped SV CpG records joined with repeat annotation
rep_sv_genotype <- load_mc_mask(
  "data/sv_genotypes_pangenome_n5_repmask_join.bed.gz",
  cpgs
)

# Records per repeat class
rep_sv_genotype_count <- count(rep_sv_genotype, tclass)

# Horizontal bars for the classes with more than 20,000 records; the plotted
# value is the record count divided by two.
rep_sv_genotype_p <- ggplot(filter(rep_sv_genotype_count, n > 20000)) +
  geom_col(aes(x = n / 2, y = reorder(tclass, n))) +
  labs(
    title = "Repeats in SV CpGs",
    subtitle = "Genotyped SV CpGs",
    x = "Number of CpGs",
    y = "Repeat"
  )
ggsave(filename = "plots/cpg_sv_genotype_repeats.pdf", plot = rep_sv_genotype_p)

# Stacked bars per repeat class: methylated SV CpG records versus the
# remaining genotyped ones. After the join, n.x is the genotyped count and
# n.y the methylated count.
rep_p <- left_join(rep_sv_genotype_count, rep_sv_mc_count, by = "tclass") %>%
  # A class with no methylated records has no partner row, so n.y is NA.
  # Count it as zero; otherwise the NA makes the filter below drop the class
  # regardless of how many genotyped records it has.
  replace_na(replace = list(n.y = 0L)) %>%
  filter((n.x + n.y) / 2 > 10000) %>%
  mutate(
    Methylated = n.y,
    Genotyped = n.x - n.y
  ) %>%
  select(tclass, Methylated, Genotyped) %>%
  pivot_longer(c(Methylated, Genotyped), names_to = "CpG", values_to = "N") %>%
  ggplot() +
  geom_col(aes(y = reorder(tclass, N), x = N / 2, fill = CpG)) +
  labs(
    x = "Number of CpGs", y = "Repeat category", title = "Repeats in SV CpGs",
    subtitle = "Genotyped and methylated SV CpGs"
  ) +
  theme(legend.position = "top") +
  scale_fill_npg()
ggsave("plots/cpg_repeats.pdf", rep_p)


# Per-record methylation rate of SV CpGs. Genotyped records (freq.x) are
# matched to methylated records (freq.y) on node, position, strand and repeat
# class; records without a methylated partner get a frequency of 0.
# The result stays grouped by tclass.
# NOTE(review): the join keys do not include the repeat coordinates or name,
# so a CpG annotated with several repeats of the same class may be matched
# more than once -- confirm this is intended.
sv_repeats_rates <- left_join(rep_sv_genotype,
  rep_sv_mc,
  by = c("node", "pos", "strand", "tclass")
) %>%
  replace_na(replace = list(freq.y = 0)) %>%
  group_by(tclass) %>%
  mutate(Methylated = freq.y / freq.x * 100)

# Per-class summary of those rates. Built from sv_repeats_rates (still grouped
# by tclass) instead of repeating the identical join a second time.
sv_repeats_quantiles <- sv_repeats_rates %>%
  summarize(
    Q1 = quantile(Methylated, 0.25),
    Q2 = quantile(Methylated, 0.50),
    Q3 = quantile(Methylated, 0.75),
    Mean = mean(Methylated),
    N = n() / 2
  ) %>%
  mutate(CpG = "SV")

# Genotyped reference CpG records joined with repeat annotation
rep_ref_genotype <- load_mc_mask(
  "data/ref_genotypes_pangenome_n5_repmask_join.bed.gz",
  cpgs
)

# Per-record methylation rate of reference CpGs: genotyped frequency (freq.x)
# against methylated frequency (freq.y, 0 when there is no methylated
# partner). The result stays grouped by tclass.
ref_repeats_rates <- rep_ref_genotype %>%
  left_join(rep_ref_mc, by = c("node", "pos", "strand", "tclass")) %>%
  replace_na(replace = list(freq.y = 0)) %>%
  group_by(tclass) %>%
  mutate(Methylated = freq.y / freq.x * 100)


# Repeat classes present among the SV CpG records
tnames <- sv_repeats_rates %>%
  pull(tclass) %>%
  unique()

# Compare SV and reference methylation rates class by class.
#
# test_fun  two-sample test taking (x, y, alternative = ), e.g. wilcox.test
# classes   repeat classes to test
# sv_rates, ref_rates  tables with columns `tclass` and `Methylated`
#
# Returns one tidied row per class, with the class stored in column `tname`
# and an FDR-adjusted p-value in `adj.p.value`.
compare_rates_by_class <- function(test_fun, classes, sv_rates, ref_rates) {
  lapply(classes, function(tname1) {
    sv_values <- sv_rates %>%
      filter(tclass == tname1) %>%
      pull(Methylated)
    ref_values <- ref_rates %>%
      filter(tclass == tname1) %>%
      pull(Methylated)
    # tidy() is provided by broom, which is not among the packages attached
    # at the top of this script, so it is called through its namespace.
    broom::tidy(test_fun(sv_values, ref_values, alternative = "two.sided")) %>%
      mutate(tname = tname1)
  }) %>%
    bind_rows() %>%
    # Adjust across all classes tested by this call
    mutate(adj.p.value = p.adjust(p.value, method = "fdr"))
}

wilcox_tests <- compare_rates_by_class(
  wilcox.test, tnames, sv_repeats_rates, ref_repeats_rates
)

t_tests <- compare_rates_by_class(
  t.test, tnames, sv_repeats_rates, ref_repeats_rates
)

# Full (untidied) test output for CpGs outside any repeat
wilcox.test(sv_repeats_rates %>% filter(tclass == "None") %>% pull(Methylated),
            ref_repeats_rates %>% filter(tclass == "None") %>% pull(Methylated),
            alternative = "two.sided")

# Per-class summary of reference methylation rates. ref_repeats_rates already
# holds the genotype/methylation join with the per-record `Methylated` rate,
# so it is summarised directly instead of recomputing the same join.
ref_repeats_quantiles <- ref_repeats_rates %>%
  group_by(tclass) %>%
  summarize(
    Q1 = quantile(Methylated, 0.25),
    Q2 = quantile(Methylated, 0.50),
    Q3 = quantile(Methylated, 0.75),
    Mean = mean(Methylated),
    N = n() / 2
  ) %>%
  mutate(CpG = "Reference")

# SV rows first, then reference rows
write_tsv(
  bind_rows(sv_repeats_quantiles, ref_repeats_quantiles),
  "data/rep_percent.tsv.gz"
)


# Methylation-rate summary per repeat class and CpG set. SV_Q2 copies the
# first Q2 value found within each class onto all of that class's rows; it is
# used below to order the classes.
rep_percent_df <- read_tsv("data/rep_percent.tsv.gz") %>%
  group_by(tclass) %>%
  mutate(SV_Q2 = dplyr::first(Q2))

# Small families: classes whose SV row has N below 10,000
tclass_set <- unique(pull(filter(rep_percent_df, CpG == "SV" & N < 1e4), tclass))
rep_percent_df <- filter(rep_percent_df, tclass %in% tclass_set)

# Order the classes by SV_Q2, read off the SV rows
tclass_order <- rep_percent_df %>%
  arrange(SV_Q2) %>%
  filter(CpG == "SV") %>%
  pull(tclass)

rep_percent_df <- mutate(
  rep_percent_df,
  tclass = factor(tclass, levels = tclass_order)
)

# Interquartile range (bar), median (dot) and mean (diamond) per class,
# coloured by CpG set.
rep_percent_p <- ggplot(rep_percent_df, aes(y = tclass, color = CpG)) +
  geom_errorbar(aes(xmin = Q1, xmax = Q3), alpha = 0.5) +
  geom_point(aes(x = Q2)) +
  geom_point(aes(x = Mean), shape = 5) +
  labs(
    title = "Methylation rate across repeat categories",
    subtitle = "In small families with less than 10,000 CpGs",
    x = "CpG methylation rate (%)",
    y = "Repeat category"
  ) +
  theme(legend.position = "top")

ggsave(
  filename = "plots/cpg_methylation_rate_repeats_small.pdf",
  plot = rep_percent_p, width = 7, height = 10
)
ggsave(
  filename = "plots/cpg_methylation_rate_repeats_small.png",
  plot = rep_percent_p, width = 7, height = 10
)

# Methylation-rate summary per repeat class and CpG set, re-read from disk.
# SV_Q2 copies the first Q2 value found within each class onto all of that
# class's rows; it is used below to order the classes.
rep_percent_df <- read_tsv("data/rep_percent.tsv.gz") %>%
  group_by(tclass) %>%
  mutate(SV_Q2 = dplyr::first(Q2))

# Large families: classes whose SV row has N of at least 10,000. `>=` matches
# the subtitle and complements the `N < 1e4` cut of the small-family plot, so
# a class with exactly 10,000 is not left out of both.
tclass_set <- rep_percent_df %>%
  filter(CpG == "SV", N >= 1e4) %>%
  pull(tclass) %>%
  unique()
rep_percent_df <- rep_percent_df %>% filter(tclass %in% tclass_set)

# Order the classes by SV_Q2, read off the SV rows
tclass_order <- rep_percent_df %>%
  arrange(SV_Q2) %>%
  filter(CpG == "SV") %>%
  pull(tclass)

# Drop the grouping before converting the former grouping column to a factor.
rep_percent_df <- rep_percent_df %>%
  ungroup() %>%
  mutate(tclass = factor(tclass, levels = tclass_order))

# Interquartile range (bar), median (dot) and mean (diamond) per class,
# coloured by CpG set.
rep_percent_p <- rep_percent_df %>%
  ggplot() +
  geom_errorbar(aes(xmin = Q1, xmax = Q3, y = tclass, color = CpG), alpha = 0.5) +
  geom_point(aes(x = Q2, y = tclass, color = CpG)) +
  geom_point(aes(x = Mean, y = tclass, color = CpG), shape = 5) +
  # A single y label; the second, conflicting `y = "Repeat"` was removed.
  labs(
    x = "CpG methylation rate (%)", y = "Repeat category",
    title = "Methylation across repeat categories",
    subtitle = "In large families with at least 10,000 SV CpGs"
  ) +
  theme(legend.position = "top")

ggsave("plots/cpg_methylation_rate_repeats.pdf", rep_percent_p)
ggsave("plots/cpg_methylation_rate_repeats.png", rep_percent_p)

# Combined 2 x 2 figure with automatic panel labels. `rep_percent_p` is the
# large-family methylation-rate plot here, because that is the last value
# assigned to it above.
repeats_fig_p <- plot_grid(
  plotlist = list(
    rep_p,
    repeats_enrichment_p,
    rep_sv_mc_freq_tclass_subset_p,
    rep_percent_p
  ),
  labels = "AUTO",
  ncol = 2,
  nrow = 2
)
ggsave(filename = "figures/repeats.pdf", plot = repeats_fig_p, width = 12, height = 12)
ggsave(filename = "figures/repeats.png", plot = repeats_fig_p, width = 12, height = 12)
