library(tidyverse)
library(ggsci)

# Make black-and-white with a 14 pt base font the default ggplot2 theme for
# every plot built after this point.
theme_set(theme_bw(base_size = 14))

#' Plot the number of new sequences contributed by each added genome
#'
#' Reads a header-less, tab-separated file whose three columns are named
#' `Genome`, `Sample` and `N`, numbers the rows in file order, and draws `N`
#' against that row index on a log2 y-axis. Points are coloured by `Genome`.
#' The figure is also written to `plots/<name>_saturation.pdf`.
#'
#' @param path Path to the tab-separated input file (no header row).
#' @param name Prefix used to build the output file name.
#' @return The ggplot object that was saved.
plot_minigraph <- function(path, name) {
  genome_levels <- c("hg38", "HPRC", "GA4K")
  # Name the colours so each genome keeps its own colour even when one of the
  # groups is absent from the input; unnamed values are assigned by position
  # to whichever levels happen to be present.
  genome_colors <- setNames(c("black", pal_npg()(2)), genome_levels)

  saturation <- read_tsv(path, col_names = c("Genome", "Sample", "N")) %>%
    mutate(
      nth = row_number(),
      Genome = factor(Genome, levels = genome_levels)
    )

  sat_p <- ggplot(saturation, aes(x = nth, y = N)) +
    # `linewidth` is the line-thickness argument in ggplot2 >= 3.4.0, where
    # `size` is deprecated for line geoms.
    geom_line(linewidth = 0.5, alpha = 0.7) +
    geom_point(aes(color = Genome), size = 0.5) +
    labs(
      title = "Sequences added to graph",
      y = "Number of new sequences",
      x = "Nth additional haploid genome"
    ) +
    scale_y_continuous(trans = "log2", breaks = c(1, 128, 512, 2048, 16384)) +
    scale_color_manual(values = genome_colors)

  out_file <- file.path("plots", paste0(name, "_saturation.pdf"))
  # ggsave() does not reliably create missing directories across ggplot2
  # versions, so make sure the target folder exists first.
  dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
  ggsave(out_file, sat_p, width = 10)

  sat_p
}

# Growth of the pangenome as genomes are added ----

# Header-less TSV of graph updates; N records each row's position in the file.
mg_update <- read_tsv(
  "data/graph_update.tsv",
  col_names = c("Dataset", "Assembly", "nodes")
) %>%
  mutate(N = row_number())

# Space-delimited (despite the .csv extension), header-less table of bp per N.
mg_update_bp <- read_delim(
  "data/cumulative_sequence_growth.csv",
  col_names = c("N", "bp"),
  delim = " "
)

mg_dataset_levels <- c("hg38", "HPRC", "GA4K")
# Name the colours so each dataset keeps its own colour even when one of the
# groups is absent; unnamed values are assigned by position to whichever
# levels happen to be present.
mg_dataset_colors <- setNames(c("black", pal_npg()(2)), mg_dataset_levels)

# NOTE(review): the input file is called "cumulative_..." yet `bp` is summed
# again with cumsum() below -- confirm the column holds per-genome increments.
mg_growth_plot <- left_join(mg_update_bp, mg_update, by = "N") %>%
  arrange(N) %>%
  mutate(
    cum_bp = cumsum(bp),
    Dataset = factor(Dataset, levels = mg_dataset_levels)
  ) %>%
  ggplot(aes(x = N, y = cum_bp, color = Dataset)) +
  geom_point() +
  scale_y_continuous(breaks = c(1, 3, 7) * 1e8) +
  scale_color_manual(values = mg_dataset_colors) +
  labs(
    title = "Growth of the GA4K pangenome",
    x = "Nth genome",
    y = "Number of non-reference bp"
  )

# Auto-print the figure, as the original unassigned pipeline did.
mg_growth_plot

# Pass the plot explicitly instead of relying on last_plot(), and make sure
# the output directory exists before saving.
dir.create("plots", showWarnings = FALSE, recursive = TRUE)
ggsave(
  "plots/graph_update_saturation_np.pdf",
  mg_growth_plot,
  width = 7,
  height = 7
)
ggsave(
  "plots/graph_update_saturation_np.png",
  mg_growth_plot,
  width = 7,
  height = 7
)
