library(GenomicRanges)
library(tidyverse)
library(parallel)

# Trailing command-line arguments. Later in this script args[1] is read as the
# CpG call file and args[2] is used as the output TSV path.
args <- commandArgs(trailingOnly = TRUE)

# Read the bin table, discard any sequence whose name matches chr[MXY], and
# turn each remaining chromosome's bins into its own GRanges object.
# Result: a list with one GRanges per chromosome, in group_split() order.
cpg_bins_bed <- read_tsv("data/chm13v2_200bp_bins.tsv") %>%
  filter(!str_detect(seqnames, "chr[MXY]")) %>%
  group_split(seqnames) %>%
  lapply(
    makeGRangesFromDataFrame,
    seqnames.field = "seqnames",
    start.field = "start",
    end.field = "end"
  )

# CpG index for the graph: one row per site, identified by node, position and
# strand. The first line of the file is skipped (presumably its own header --
# confirm) and the column names below are used instead.
cpgs <- read_delim(
  file = "gfa/cpg_index_xaf_strand.csv.gz",
  delim = " ",
  col_names = c("node", "pos", "strand"),
  skip = 1
)


# Node coordinates from the graph BED file, restricted to rows with rank == 0
# (presumably the nodes on the reference path -- confirm against how
# graph_xaf.bed is produced).
gfa_bed <- filter(
  read_tsv(
    "gfa/graph_xaf.bed",
    col_names = c("chrom", "start", "end", "node", "rank")
  ),
  rank == 0
)

# Fail fast on a bad command line. Without this check a missing input path
# becomes NA and surfaces as an unclear reader error, and a missing output
# path is only noticed at the final write, after all the work has been done.
if (length(args) < 2) {
  stop(
    "Usage: Rscript SCRIPT.R CPG_CALLS_FILE OUTPUT_TSV",
    call. = FALSE
  )
}

cpg_bed_path <- args[1]
## cpg_bed_path = "test/UNMC-000052-03.GRCh38.deepvariant.haplotagged.ref.csv"

# CpG calls for one sample: space-delimited, no header row is skipped, and the
# five columns are named here. node/pos/strand identify the site; mc and score
# are the per-site values that get averaged per bin later on.
cpg_bed <- read_delim(cpg_bed_path,
  delim = " ",
  col_names = c("node", "pos", "strand", "mc", "score")
)

# Attach the sample's calls to every site in the CpG index. The join keys are
# spelled out (they are exactly the columns the two tables share) so the join
# does not silently change if either table gains a column, and so dplyr does
# not have to guess and print a "Joining with ..." message.
# Sites with no call in the sample get mc = 0; their score stays NA.
cpgs_gw <- left_join(cpgs, cpg_bed, by = c("node", "pos", "strand")) %>%
  replace_na(list(mc = 0)) %>%
  select(-strand)

# Place every CpG site on linear coordinates and build one GRanges per
# chromosome (list order follows group_split()).
#
# - The join key is given explicitly: "node" is the only column shared by
#   cpgs_gw and gfa_bed.
# - mutate() evaluates its arguments in order, so `end = start` uses the
#   already-shifted start; each site becomes a width-1 range.
# - filter() keeps only rows where the condition is TRUE, so sites whose node
#   found no match in gfa_bed (chrom is NA) are dropped together with
#   chrM/chrX/chrY.
# - mc and score are carried along as metadata columns.
cpg_ref_bed <- left_join(cpgs_gw, gfa_bed, by = "node") %>%
  mutate(
    start = start + pos,
    end = start
  ) %>%
  filter(!str_detect(chrom, "chr[MXY]")) %>%
  group_by(chrom) %>%
  group_split() %>%
  lapply(function(b) {
    makeGRangesFromDataFrame(
      b,
      seqnames.field = "chrom",
      start.field = "start",
      end.field = "end",
      keep.extra.columns = TRUE
    )
  })

# Drop the intermediate tables that are no longer referenced below, then
# trigger a garbage collection.
rm(cpg_bed, cpgs_gw, gfa_bed, cpgs)
gc()

# Average score and mc over the CpG sites falling in each bin, one summary
# table per chromosome.
#
# Bins and CpG sites are paired by chromosome NAME instead of by list position
# (previously a hard-coded 1:22), so a chromosome that is missing from one of
# the two inputs can no longer shift the pairing or cause an out-of-range
# list index.
chrom_of <- function(gr) as.character(seqnames(gr)[1L])
bin_chroms <- vapply(cpg_bins_bed, chrom_of, character(1))
ref_chroms <- vapply(cpg_ref_bed, chrom_of, character(1))
shared_chroms <- intersect(bin_chroms, ref_chroms)
if (length(shared_chroms) == 0) {
  stop("No chromosome is shared between the bins and the CpG sites",
    call. = FALSE
  )
}

cpg_bins <- lapply(shared_chroms, function(chrom_name) {
  bins <- cpg_bins_bed[[match(chrom_name, bin_chroms)]]
  sites <- cpg_ref_bed[[match(chrom_name, ref_chroms)]]

  hits <- findOverlaps(bins, sites)
  bin_idx <- queryHits(hits) # indices into `bins`
  site_idx <- subjectHits(hits) # indices into `sites`

  tibble(
    # The chromosome label comes straight from the pairing key. It used to be
    # read from the CpG ranges subset by bin indices, which is the wrong index
    # space and can fail when a bin index exceeds the number of CpG sites.
    chrom = rep(chrom_name, length(bin_idx)),
    start = start(bins)[bin_idx],
    end = end(bins)[bin_idx],
    # Plain integer bin index: bins without hits never appear as rows, so a
    # factor carrying one level per bin added nothing but overhead.
    bin = bin_idx,
    score = sites$score[site_idx],
    mc = sites$mc[site_idx]
  ) %>%
    group_by(bin, chrom, start, end) %>%
    summarise(
      # A bin whose sites all have NA score yields NaN here.
      avg_score = mean(score, na.rm = TRUE),
      avg_mc = mean(mc, na.rm = TRUE),
      .groups = "drop"
    )
})

# Stack the per-chromosome summaries, discard the helper bin index, sort by
# genomic position and write the table to the path given as args[2].
bind_rows(cpg_bins) %>%
  ungroup() %>%
  select(-bin) %>%
  arrange(chrom, start, end) %>%
  write_tsv(file = args[2])
