library(tidyverse)
library(ggsci)
library(repeatR)
library(GenomicRanges)
library(geomtextpath)
library(parallel)
library(scales)
library(cowplot)
library(ggbreak)

## Global plotting default: black-and-white theme, 14 pt base font.
ggplot2::theme_set(ggplot2::theme_bw(base_size = 14))

## Number of frequency bins and number of samples used by the frequency
## histograms in this script.
## (n_samples was previously derived as max(n_alt_cpgs$node_freq).)
n_bins <- 30
n_samples <- 435

## Binned frequency table for the methylated SV CpG calls.
## Input: space-delimited, header-less file with columns node, pos, strand,
## freq. Output columns: Freq (upper label of the bin) and total_cpgs.
sv_mc_freq <- read_delim(
  "data/sv_mc_pangenome_n5.csv.gz",
  col_names = c("node", "pos", "strand", "freq"),
  delim = " "
) %>%
  ## Shift "+" strand positions by one so that both strands share one
  ## (node, pos) key. Rows with any other strand value get pos = NA.
  mutate(pos = case_when(strand == "+" ~ pos + 1, strand == "-" ~ pos)) %>%
  ## One row per (node, pos), keeping the larger of the recorded frequencies.
  ## `.groups = "drop"` avoids carrying an implicit `node` grouping forward.
  group_by(node, pos) %>%
  summarise(freq = max(freq), .groups = "drop") %>%
  ## Number of sites at each distinct frequency (sorted by freq).
  count(freq, name = "total_cpgs") %>%
  ## NOTE(review): ntile() splits the *distinct observed* frequencies into
  ## n_bins rank buckets; it does not bin by the frequency value itself. If
  ## some frequencies never occur, bucket labels drift away from the true
  ## frequency range -- confirm this is intended.
  mutate(Freq = ntile(freq, n = n_bins) * (n_samples / n_bins)) %>%
  group_by(Freq) %>%
  summarise(total_cpgs = sum(total_cpgs))
write_tsv(sv_mc_freq, "data/sv_mc_freq.tsv.gz")

## PANEL: histogram of methylated non-reference CpG frequencies.
## The binned table is re-read from the file written above.
sv_mc_freq <- read_tsv("data/sv_mc_freq.tsv.gz")
sv_mc_freq_p <- ggplot(sv_mc_freq, aes(x = Freq, y = total_cpgs)) +
  ## One pre-counted bar per bin; bar width equals the nominal bin width.
  geom_col(width = n_samples / n_bins) +
  ## geom_histogram(aes(x = freq)) +
  scale_y_continuous(labels = label_scientific()) +
  labs(
    title = "Frequency of non-reference 5mCpGs",
    subtitle = "Within the 435 methylomes",
    x = "5mCpG frequency",
    y = "Number of 5mCpGs"
  )
ggsave("plots/cpg_sv_freq_histogram.pdf", plot = sv_mc_freq_p)

## PANEL: frequency of genotyped non-reference (SV) CpGs.
## freq_alt_cpgs <- read_tsv("data/freq_alt_cpgs.tsv.gz")
## Input: space-delimited, header-less file with columns node, pos, strand,
## freq. Output columns: Freq (upper label of the bin) and total_cpgs.
freq_alt_cpgs <- read_delim(
  "data/sv_genotypes_pangenome_n5.csv.gz",
  col_names = c("node", "pos", "strand", "freq"),
  delim = " "
) %>%
  ## Shift "+" strand positions by one so that both strands share one
  ## (node, pos) key. Rows with any other strand value get pos = NA.
  mutate(pos = case_when(strand == "+" ~ pos + 1, strand == "-" ~ pos)) %>%
  ## One row per (node, pos), keeping the larger of the recorded frequencies.
  ## `.groups = "drop"` avoids carrying an implicit `node` grouping forward.
  group_by(node, pos) %>%
  summarise(freq = max(freq), .groups = "drop") %>%
  ## Number of sites at each distinct frequency (sorted by freq).
  count(freq, name = "total_cpgs") %>%
  ## NOTE(review): ntile() buckets the *distinct observed* frequencies by
  ## rank, not by value; labels drift if some frequencies never occur.
  mutate(Freq = ntile(freq, n = n_bins) * (n_samples / n_bins)) %>%
  group_by(Freq) %>%
  summarise(total_cpgs = sum(total_cpgs))

alt_cpg_freq_p <- freq_alt_cpgs %>%
  ggplot() +
  ## One pre-counted bar per bin; bar width equals the nominal bin width.
  geom_bar(
    aes(x = Freq, y = total_cpgs),
    width = n_samples / n_bins,
    stat = "identity"
  ) +
  scale_x_continuous(breaks = c(1, 100, 200, 300, 400)) +
  labs(
    x = "Frequency", y = "Number of CpGs",
    title = "Frequency of non-reference CpGs",
    subtitle = "Within the 435 methylomes"
  )

ggsave("plots/pangenome_alt_cpgs.png", alt_cpg_freq_p)
ggsave("plots/pangenome_alt_cpgs.pdf", alt_cpg_freq_p)

# PANEL: genotype and methylation frequency distributions, one facet each.
## Stack the two binned tables and tag each row with its source.
cpg_freq_by_kind <- bind_rows(
  freq_alt_cpgs %>% mutate(CpG = "Genotyped"),
  sv_mc_freq %>% mutate(CpG = "Methylated")
) %>%
  select(Freq, CpG, total_cpgs) %>%
  ## Fixed level order so "Genotyped" is the top facet.
  mutate(CpG = factor(CpG, levels = c("Genotyped", "Methylated")))

pangenome_alt_cpgs_p <- ggplot(cpg_freq_by_kind) +
  ## `just = 1` moves the bars off-centre relative to their Freq value
  ## (see the `just` argument of geom_bar()).
  geom_bar(
    aes(x = Freq, y = total_cpgs),
    stat = "identity",
    width = n_samples / n_bins,
    position = "stack",
    just = 1
  ) +
  ## geom_path(aes(x = Freq, y = N, color = CpG), stat = "identity", width = n_samples / n_bins) +
  ## scale_y_log10() +
  scale_fill_npg() +
  scale_color_npg() +
  scale_x_continuous(breaks = c(1, 100, 200, 300, 400)) +
  labs(
    title = "SV CpGs in 435 methylomes",
    subtitle = "Genotype and methylation frequency distributions",
    x = "Frequency",
    y = "Number of CpGs"
  ) +
  facet_wrap(~CpG, dir = "v") +
  theme(legend.position = "top")

## Per-CpG methylation counts (mc_freq) joined onto genotype counts (cpg_freq).
## right_join() keeps every row of the genotype table; rows without a
## methylation record get mc_freq = 0. The join keys are spelled out so the
## join cannot silently change if a column is added to either file.
sv_mc <- right_join(
  read_delim(
    "data/sv_mc_pangenome_n5.csv.gz",
    col_names = c("node", "pos", "strand", "mc_freq"),
    delim = " "
  ),
  read_delim(
    "data/sv_genotypes_pangenome_n5.csv.gz",
    col_names = c("node", "pos", "strand", "cpg_freq"),
    delim = " "
  ),
  by = c("node", "pos", "strand")
) %>%
  replace_na(list(mc_freq = 0))

## Same construction for the reference CpGs.
ref_mc <- right_join(
  read_delim(
    "data/ref_mc_pangenome_n5.csv.gz",
    col_names = c("node", "pos", "strand", "mc_freq"),
    delim = " "
  ),
  read_delim(
    "data/ref_genotypes_pangenome_n5.csv.gz",
    col_names = c("node", "pos", "strand", "cpg_freq"),
    delim = " "
  ),
  by = c("node", "pos", "strand")
) %>%
  replace_na(list(mc_freq = 0))


## Histogram of how many samples carry each reference CpG.
## The plot is stored and passed to ggsave() explicitly instead of relying on
## last_plot(), matching how the other panels in this script are saved.
## `bins = 30` is geom_histogram()'s default, written out to silence the
## "pick a better value" message without changing the figure.
ref_cpg_freq_p <- ref_mc %>%
  ggplot(aes(x = cpg_freq)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Population frequency of reference CpGs",
    x = "Frequency", y = "Number of CpGs"
  )
ggsave("plots/reference_cpg_frequency_histogram.pdf", ref_cpg_freq_p,
       width = 7, height = 7)
ggsave("plots/reference_cpg_frequency_histogram.png", ref_cpg_freq_p,
       width = 7, height = 7)

## ---- Methylation summaries per CpG population-frequency bin ----

## Bin edges 1, 16, 31, ... plus a final edge at n_samples.
## seq(1, n_samples, by = 15) alone stops below n_samples (at 421 for
## n_samples = 435), which left the most common CpGs without a bin; they were
## then silently removed together with the NA bin. The first bin is closed on
## the left (include.lowest = TRUE in the helper below) so cpg_freq == 1 is
## kept as well.
freq_bin_breaks <- unique(c(seq(1, n_samples, by = 15), n_samples))

## Bin labels in ascending order, spelled exactly as cut() spells them.
## Used to restore the factor order after the TSV round trip and to label
## the x axis of the plots.
freq_bin_levels <- levels(
  cut(numeric(0), breaks = freq_bin_breaks, include.lowest = TRUE)
)

## Summarise a table with columns cpg_freq and mc_freq per frequency bin.
## Returns one row per non-empty bin with:
##   n0       - number of rows whose methylation rate is exactly 0
##   n        - number of rows in the bin
##   avg_rate - mean of mc_freq / cpg_freq
summarise_mc_by_freq_bin <- function(mc) {
  mc %>%
    mutate(
      freq_bin = cut(cpg_freq,
        breaks = freq_bin_breaks,
        include.lowest = TRUE,
        ordered_result = TRUE
      ),
      mc_rate = mc_freq / cpg_freq
    ) %>%
    group_by(freq_bin) %>%
    summarise(n0 = sum(mc_rate == 0), n = n(), avg_rate = mean(mc_rate)) %>%
    ## Removes the NA bin (cpg_freq outside [1, n_samples]) and any bin whose
    ## summary is NA/NaN.
    na.omit()
}

ref_mc_freq_bins <- summarise_mc_by_freq_bin(ref_mc)
sv_mc_freq_bins <- summarise_mc_by_freq_bin(sv_mc)

bind_rows(
  ref_mc_freq_bins %>% mutate(CpG = "Reference"),
  sv_mc_freq_bins %>% mutate(CpG = "SV")
) %>%
  mutate(freq_bin = as.character(freq_bin)) %>%
  write_tsv("data/unmethylated_frequency_bins.tsv.gz")

## Re-read the table with one type per column (freq_bin, n0, n, avg_rate, CpG)
## and rebuild freq_bin with the full, ordered set of levels so that the
## numeric x position of a bin does not depend on row order in the file.
freq_bins <- read_tsv(
  "data/unmethylated_frequency_bins.tsv.gz",
  col_types = "ciidc"
) %>%
  mutate(freq_bin = factor(freq_bin, levels = freq_bin_levels))

## Scatter + smoother of a per-bin statistic against the frequency bin,
## coloured by CpG class. `y` is an unquoted expression over the columns of
## freq_bins (e.g. n0 / n).
plot_by_freq_bin <- function(y, y_lab, title) {
  freq_bins %>%
    ggplot(aes(
      x = as.double(freq_bin), y = {{ y }},
      color = CpG, fill = CpG
    )) +
    geom_smooth() +
    geom_point() +
    ## One tick per bin, labelled with the bin's interval.
    scale_x_continuous(
      breaks = seq_along(freq_bin_levels),
      labels = freq_bin_levels
    ) +
    scale_y_continuous(limits = c(0, 1)) +
    theme(
      axis.text.x = element_text(angle = -90, vjust = 0.5),
      legend.position = "top"
    ) +
    labs(x = "CpG frequency bin", y = y_lab, title = title) +
    scale_color_npg() +
    scale_fill_npg()
}

never_methylated_p <- plot_by_freq_bin(
  n0 / n,
  y_lab = "Fraction never methylated",
  title = "Unmethylated CpGs among rare and common CpGs"
)
ggsave("plots/never_methylated_freq.png", never_methylated_p)
ggsave("plots/never_methylated_freq.pdf", never_methylated_p)

mean_methylated_rate_p <- plot_by_freq_bin(
  avg_rate,
  y_lab = "Average methylation rate",
  title = "Methylation rate among rare and common CpGs"
)
ggsave("plots/mean_methylated_rate_freq.png", mean_methylated_rate_p)
ggsave("plots/mean_methylated_rate_freq.pdf", mean_methylated_rate_p)

# PANEL: distribution of per-CpG methylation rate, SV vs. reference CpGs.

## Keep only "+" strand rows of each table, then stack them; the list names
## become the CpG column used for faceting.
mc_rate_p <- list("SV-CpG" = sv_mc, "Reference CpG" = ref_mc) %>%
  lapply(function(tbl) filter(tbl, strand == "+")) %>%
  bind_rows(.id = "CpG") %>%
  ## Methylation rate = methylated count / genotype count for the same CpG.
  mutate(mc_rate = mc_freq / cpg_freq) %>%
  ## filter(mc_rate <= 1) %>%
  ggplot(aes(x = mc_rate * 100)) +
  geom_histogram(bins = 20) +
  facet_wrap(~CpG, scales = "free_y", dir = "v") +
  labs(
    x = "Methylation rate (%)",
    title = "Methylation rate of CpGs",
    subtitle = "Adjusted for allele frequency"
  )

for (mc_rate_path in c(
  "plots/pangenome_alt_cpgs_methylation_rate.png",
  "plots/pangenome_alt_cpgs_methylation_rate.pdf"
)) {
  ggsave(mc_rate_path, mc_rate_p, width = 7, height = 7)
}

## Summarise cumulative CpG counts across a set of per-run files.
##
## Each file is read as a header-less table with columns node, pos, strand,
## genome (the delimiter is guessed by read_delim()). Per file, rows are
## counted per `genome`, accumulated in `genome` order, and differenced to get
## the change between consecutive `genome` values. The per-file curves are
## then combined per `genome` across all files.
##
## Args:
##   files:    character vector of input paths; must not be empty.
##   mc.cores: number of worker processes passed to parallel::mclapply().
##
## Returns a tibble with one row per `genome` and the columns
##   meanN, sdN   - mean / sd of the cumulative count across files
##   meanC, sdC   - mean / sd of the per-step change across files
##   minNY, maxNY - meanN -/+ 1.96 * sdN
##   minCY, maxCY - meanC -/+ 1.96 * sdC
load_saturation <- function(files, mc.cores = 7) {
  ## Fail early with a clear message; otherwise an empty file list only
  ## surfaces later as a missing `genome` column.
  if (length(files) == 0) {
    stop("load_saturation(): no input files supplied", call. = FALSE)
  }

  per_file <- mclapply(
    files,
    function(p) {
      read_delim(p,
        col_names = c("node", "pos", "strand", "genome")
      ) %>%
        ## mutate(genome = 443 - genome) %>%
        count(genome) %>%
        ## NOTE(review): the cumulative count is halved, presumably because
        ## every CpG is listed once per strand -- confirm against the files.
        mutate(cumN = cumsum(n) / 2) %>%
        mutate(Change = cumN - dplyr::lag(cumN))
    },
    mc.cores = mc.cores, mc.preschedule = TRUE
  )

  ## mclapply() returns "try-error" objects for failed workers instead of
  ## raising; report which files failed rather than failing in bind_rows().
  failed <- vapply(per_file, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    stop(
      "load_saturation(): failed to process: ",
      paste(files[failed], collapse = ", "),
      call. = FALSE
    )
  }

  per_file %>%
    bind_rows() %>%
    group_by(genome) %>%
    summarise(
      meanN = mean(cumN), sdN = sd(cumN),
      ## The first step of every file has no predecessor, so Change is NA.
      meanC = mean(Change, na.rm = TRUE), sdC = sd(Change, na.rm = TRUE)
    ) %>%
    mutate(
      minNY = meanN - 1.96 * sdN,
      maxNY = meanN + 1.96 * sdN,
      minCY = meanC - 1.96 * sdC,
      maxCY = meanC + 1.96 * sdC
    )
}

## Saturation tables for the four CpG classes.
## `pattern` is a regular expression, not a glob: literal dots are escaped,
## every pattern is anchored at the end of the file name, and the glob-style
## leading "*" (not a valid regex quantifier) is removed.
cpg_sat_df <- load_saturation(
  list.files("data/methylome",
    pattern = "sv_cpg_n5.*\\.csv\\.gz$", full.names = TRUE
  )
)

svmcsatdf <- load_saturation(
  list.files("data/methylome",
    pattern = "sv_methylome_n5.*\\.csv\\.gz$", full.names = TRUE
  )
)

refcpgsatdf <- load_saturation(
  list.files("data/methylome",
    pattern = "ref_cpg_n5.*\\.csv\\.gz$", full.names = TRUE
  )
)

refmcsatdf <- load_saturation(
  list.files("data/methylome",
    pattern = "ref_methylome_n5.*\\.csv\\.gz$", full.names = TRUE
  )
)

## Stack the four curves, label them, and keep only genome indices up to the
## number of samples (n_samples, 435) before writing the table to disk.
mcsatdf <- bind_rows(
  svmcsatdf %>% mutate(CpGs = "SV 5mCpGs"),
  refcpgsatdf %>% mutate(CpGs = "CHM13v2 CpGs"),
  cpg_sat_df %>% mutate(CpGs = "SV CpGs"),
  refmcsatdf %>% mutate(CpGs = "CHM13v2 5mCpGs")
) %>%
  filter(genome <= n_samples)
write_tsv(mcsatdf, "data/methylome_saturation.tsv.gz")

## PANEL: cumulative CpG counts (meanN) against the genome index, one line per
## CpG class, drawn on a broken y axis.
## The table is re-read from the file written above and CpGs is turned into a
## factor with a fixed level order.
mcsatdf <- read_tsv("data/methylome_saturation.tsv.gz") %>%
  mutate(CpGs = factor(CpGs, levels = c("CHM13v2 CpGs", "CHM13v2 5mCpGs", "SV CpGs", "SV 5mCpGs")))
mcsat_p <- mcsatdf  %>% ggplot() +
  ## Mean cumulative count per class.
  geom_path(aes(x = genome, y = meanN, color = CpGs, group = CpGs)) +
  ## geom_textline(aes(x = genome, y = meanN, color = CpGs, label = CpGs), size = 6,
  ##               hjust = "jitter") +
  ## Shaded band between the minNY and maxNY columns of the table.
  geom_ribbon(aes(x = genome, ymin = minNY, ymax = maxNY, fill = CpGs, group = CpGs), alpha = 0.3) +
  labs(x = "Nth methylome", y = "Cumulative CpGs", title = "Size of the panmethylome", subtitle = "Reference and SV CpGs and 5mCpGs") +
  ## Hand-picked y ticks.
  ## NOTE(review): presumably chosen to sit near the start/end values of the
  ## four curves -- re-check if the input data change.
  scale_y_continuous(breaks = c(7.4e5, 4.06e6, 24.2e6, 28.3e6, 30.5e6, 32e6), labels = scientific) +
  ## ggbreak: cut the y axis between 4.0e6 and 24.0e6.
  scale_y_break(c(4.0e6, 24.0e6), scales = "free") +
  theme(
    ## Legend is hidden on the stored plot; it is switched back on for the
    ## standalone PDF saved below.
    legend.position = "none",
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    ## Blank the right-hand y axis (presumably the duplicate axis added by the
    ## axis break -- TODO confirm).
    axis.text.y.right = element_blank(),
    axis.line.y.right = element_blank(),
    axis.ticks.y.right = element_blank()) +
  scale_color_npg() +
  scale_fill_npg()
ggsave("plots/methylome_saturation.pdf", mcsat_p + theme(legend.position = "right"))


## PANEL: smoothed per-step change in cumulative CpGs (meanC) against the
## genome index, one curve per CpG class, on a log10 y axis.
## NOTE(review): geom_smooth() is given a formula but no `method`, so ggplot2
## chooses the smoother itself -- confirm this is the intended fit.
mcsat_change_p <- ggplot(
  mcsatdf,
  aes(x = genome, y = meanC, color = CpGs, group = CpGs)
) +
  ## geom_point(aes(x = genome, y = meanC, color = CpGs, group = CpGs), alpha = 0.7) +
  ## geom_ribbon(aes(x = genome, ymin = minCY, ymax = maxCY, fill = CpGs, group = CpGs), alpha = 0.3) +
  geom_smooth(formula = y ~ log(x), show.legend = FALSE) +
  scale_y_log10() +
  scale_color_npg() +
  scale_fill_npg() +
  labs(
    title = "Saturation of the panmethylome",
    subtitle = "Reference and SV CpGs and 5mCpGs",
    x = "Nth methylome",
    y = "Mean new CpGs (permutations)"
  ) +
  theme(
    legend.position = "top",
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank()
  )

for (change_path in c("plots/methylome_change.pdf", "plots/methylome_change.png")) {
  ggsave(change_path, mcsat_change_p, width = 8)
}

## Shared legend: take the bottom guide box from a copy of the saturation
## plot whose legend is laid out in a single row at the bottom.
legend_source_p <- mcsat_p +
  guides(color = guide_legend(nrow = 1)) +
  theme(legend.position = "bottom")
legend_sat <- get_plot_component(
  legend_source_p,
  "guide-box-bottom",
  return_all = TRUE
)

## Panel A is an external drawing embedded as an image.
sv_cpg_pic <- cowplot::ggdraw() +
  cowplot::draw_image("figures/Picture2.pdf", scale = 1)

## FIGURE
## Row with panels B and C. The ggbreak plot is passed through print() before
## being handed to plot_grid() (presumably required for ggbreak objects to be
## combined with cowplot -- TODO confirm).
saturation_row <- plot_grid(
  print(mcsat_p), mcsat_change_p,
  labels = c("B", "C"), nrow = 1, ncol = 2
)

## Row with panels D and E.
frequency_row <- plot_grid(
  pangenome_alt_cpgs_p, mc_rate_p,
  labels = c("D", "E"), nrow = 1, ncol = 2
)

## Stack: image, B/C row, shared legend, D/E row.
panmethylome_fig <- plot_grid(
  sv_cpg_pic,
  saturation_row,
  legend_sat,
  frequency_row,
  labels = c("A", "", "", ""),
  nrow = 4, ncol = 1,
  rel_heights = c(0.45, 1, 0.1, 1)
)

## NOTE(review): the output name is spelled "panmethylone" (not
## "panmethylome"); kept as is in case other tooling depends on it.
for (fig_path in c("figures/panmethylone.pdf", "figures/panmethylone.png")) {
  ggsave(fig_path, panmethylome_fig, width = 12, height = 14)
}
