library(tidyverse)
library(parallel)
library(GenomicRanges)
library(ggrastr)
library(scales)
library(cowplot)
library(ggsci)
library(ggbeeswarm)
library(plyranges)
library(repeatR)

theme_set(theme_bw(base_size = 14))

# Genomic ranges of every distinct SV site in the SV/methylation table, with
# the "chr" prefix removed from the sequence names.
svs_gr <- makeGRangesFromDataFrame(
  read_tsv("mg_methyl.tsv.gz") %>%
    select(chrom, start, end) %>%
    mutate(chrom = str_remove(chrom, "chr")) %>%
    distinct()
)

# Per-allele SV records (with `Freq`) outside chrX/chrY; chromosome names are
# converted to numbers so they can serve as join keys further down.
svs_df <- read_tsv("mg_methyl.tsv.gz") %>%
  select(chrom, start, end, Allele, Freq) %>%
  filter(!str_detect(chrom, "chr[XY]")) %>%
  mutate(chrom = as.numeric(str_remove(chrom, "chr")))

# Calls read from the CHM13 file. The colon-delimited `call` field is split
# into six parts, of which only `Path` is kept; every row is flagged
# `Ref = TRUE`.
ref_alleles <- read_tsv(
  "data/chm13v2.calls",
  col_names = c("chrom", "start", "end", "source", "sink", "call"),
  lazy = FALSE, progress = TRUE, show_col_types = FALSE
) %>%
  separate(
    "call",
    into = c("Path", "Length", "Strand", "Contig", "Contig_start", "Contig_end"),
    sep = ":"
  ) %>%
  filter(!str_detect(chrom, "chr[XY]")) %>%
  mutate(Ref = TRUE, chrom = as.numeric(str_remove(chrom, "chr"))) %>%
  select(chrom, start, end, Path, Ref)


# Top SNP per methylation bin, read from the per-chunk result files. `binId`
# and `snpId` are colon-delimited and are split into their coordinate parts.
#
# `pattern` is a regular expression, not a shell glob: the former
# "top_snps_chunk*" meant "top_snps_chun" followed by zero or more "k".
snps <- list.files("data/qtl/",
  pattern = "top_snps_chunk",
  full.names = TRUE
) %>%
  mclapply(function(p) {
    read_tsv(p) %>%
      separate(binId,
        into = c("bin_chrom", "bin_start", "bin_end"),
        convert = TRUE, sep = ":"
      ) %>%
      separate(snpId,
        into = c("snp_chrom", "snp_pos"),
        convert = TRUE, sep = ":"
      )
  }, mc.cores = 1) %>%
  bind_rows()

# Top SV per methylation bin. The dots of the file extension are escaped and
# the pattern is anchored at the end so only "*.tsv.gz" result files match.
svs <- list.files("data/qtl/",
  pattern = "top_qtls_MAF5.*\\.tsv\\.gz$",
  full.names = TRUE
) %>%
  lapply(read_tsv) %>%
  bind_rows() %>%
  mutate(
    sv_chrom = as.numeric(str_remove(sv_chrom, "chr")),
    bin_chrom = as.numeric(str_remove(bin_chrom, "chr"))
  )

# Cache both tables on disk and reload them, so later steps work from the
# on-disk representation.
write_tsv(snps, "data/bins_snps.tsv.gz")
write_tsv(svs, "data/bins_svs.tsv.gz")

snps <- read_tsv("data/bins_snps.tsv.gz")
svs <- read_tsv("data/bins_svs.tsv.gz")


# One row per methylation bin: the top SV and top SNP results are joined on
# the bin coordinates (clashing column names get ".sv" / ".snp" suffixes),
# then annotated with the SV allele record from `svs_df` and with the `Ref`
# flag from `ref_alleles`. Alleles without a match in `ref_alleles` get
# `Ref = FALSE`.
bins_df <- svs %>%
  full_join(
    snps,
    by = c("bin_chrom", "bin_start", "bin_end"),
    suffix = c(".sv", ".snp")
  ) %>%
  left_join(
    svs_df,
    by = c(
      "sv_chrom" = "chrom", "sv_start" = "start", "sv_end" = "end",
      "allele" = "Allele"
    )
  ) %>%
  left_join(
    ref_alleles,
    by = c(
      "sv_chrom" = "chrom", "sv_start" = "start", "sv_end" = "end",
      "allele" = "Path"
    )
  ) %>%
  replace_na(list(Ref = FALSE))

# Cache the joined table and continue from the on-disk copy.
write_tsv(bins_df, "data/bins_df.tsv.gz")
bins_df <- read_tsv("data/bins_df.tsv.gz")

# Write the bins that have a significant SV or SNP QTL (adjusted p <= 0.05).
# The SV effect (`estimate`) and SNP effect (`beta`) are exported as
# `beta.sv` and `beta.snp`.
#
# The previous `select(a, -c(estimate, beta))` named a stray column `a`, which
# either fails or reduces the table to that one column; the effect columns are
# now simply renamed.
bins_df %>%
  dplyr::rename(beta.sv = estimate, beta.snp = beta) %>%
  select(
    bin_chrom, bin_start, bin_end,
    sv_chrom, sv_start, sv_end, allele, beta.sv, adj.p.value.sv,
    snp_chrom, snp_pos, beta.snp, adj.p.value.snp
  ) %>%
  filter(adj.p.value.snp <= 0.05 | adj.p.value.sv <= 0.05) %>%
  write_tsv("bins.tsv.gz")

# Genomic ranges of the methylation bins, one range per row of `bins_df`.
bins_gr <- makeGRangesFromDataFrame(bins_df,
  seqnames.field = "bin_chrom",
  start.field = "bin_start",
  end.field = "bin_end", keep.extra.columns = FALSE,
  na.rm = TRUE
)

# `na.rm = TRUE` drops rows with missing start/end. The overlap hits below are
# used as row indices into `bins_df`, which is only valid when nothing was
# dropped, so fail loudly instead of silently subsetting the wrong rows.
stopifnot(length(bins_gr) == nrow(bins_df))

# Row indices of bins that overlap at least one SV site. A bin that overlaps
# several SVs is reported once per overlap, hence `unique()`.
bins_on_sv <- unique(subjectHits(findOverlaps(svs_gr, bins_gr)))

# Logical mask instead of negative indexing: `bins_df[-integer(0), ]` selects
# zero rows, so the "no overlap" table would be empty when nothing overlaps.
bin_overlaps_sv <- seq_len(nrow(bins_df)) %in% bins_on_sv

# ECDF of -log10 p-values: `p.value` in blue, `pvalue` in red.
ggplot() +
  stat_ecdf(aes(x = -log10(bins_df$p.value)), color = "blue") +
  stat_ecdf(aes(x = -log10(bins_df$pvalue)), color = "red")
ggsave("pvaluedist.png")

# ECDF of absolute effect sizes: `estimate` in blue, `beta` in red.
ggplot() +
  stat_ecdf(aes(x = abs(bins_df$estimate)), color = "blue") +
  stat_ecdf(aes(x = abs(bins_df$beta)), color = "red")
ggsave("effetctdist.png")

# Bins overlapping an SV site, with chromosomes as a naturally ordered factor.
bins_sv_df <- bins_df[bin_overlaps_sv, ] %>%
  mutate(bin_chrom = factor(bin_chrom, levels = str_sort(unique(bin_chrom), numeric = TRUE)))

# All remaining bins, with the same chromosome factor treatment.
bins_nosv_df <- bins_df[!bin_overlaps_sv, ] %>%
  mutate(bin_chrom = factor(bin_chrom, levels = str_sort(unique(bin_chrom), numeric = TRUE)))

# Interactive inspection: bins with a significant SV association whose SV
# effect is larger in magnitude than the SNP effect, shown separately for bins
# that overlap an SV and for those that do not.
bins_sv_df %>%
  filter(adj.p.value.sv < 0.05, abs(beta) < abs(estimate))

bins_nosv_df %>%
  filter(adj.p.value.sv < 0.05, abs(beta) < abs(estimate))


# Cumulative x-axis offset (`tot`) per chromosome. The chromosome "length"
# used here is the largest `bin_end` among the significant leading-SV bins,
# accumulated over chromosomes in natural order.
genome_sv_df <- bins_df %>%
  filter(
    !str_detect(bin_chrom, "[YX]"),
    adj.p.value.sv < 0.05,
    abs(beta) < abs(estimate)
  ) %>%
  group_by(bin_chrom) %>%
  summarise(chr_len = max(bin_end)) %>%
  mutate(bin_chrom = factor(bin_chrom, levels = str_sort(unique(bin_chrom), numeric = TRUE))) %>%
  arrange(bin_chrom) %>%
  mutate(tot = cumsum(as.numeric(chr_len)) - as.numeric(chr_len)) %>%
  select(-chr_len)

# Significant leading-SV bins with their genome-wide coordinate `bp_cum`
# (chromosome offset plus bin midpoint).
manhatan_sv_df <- bins_df %>%
  filter(!str_detect(bin_chrom, "[YX]")) %>%
  mutate(bin_chrom = factor(bin_chrom, levels = str_sort(unique(bin_chrom), numeric = TRUE))) %>%
  left_join(genome_sv_df) %>%
  filter(adj.p.value.sv < 0.05, abs(beta) < abs(estimate)) %>%
  arrange(bin_chrom, bin_end) %>%
  mutate(bin_chrom = factor(bin_chrom, levels = str_sort(unique(bin_chrom), numeric = TRUE))) %>%
  mutate(bp_cum = tot + (bin_start + bin_end) / 2)

# Position of each chromosome's axis label: midpoint of its plotted range.
axis_sv_df <- manhatan_sv_df %>%
  group_by(bin_chrom) %>%
  summarize(center = (max(bp_cum) + min(bp_cum)) / 2) %>%
  mutate(bin_chrom = factor(bin_chrom, levels = str_sort(unique(bin_chrom), numeric = TRUE)))


# Two alternating colours, repeated for 22 chromosomes and named by chromosome.
chrom_colors <- rep(c("#E64B35FF", "#4DBBD5FF"), 11)
names(chrom_colors) <- sort(axis_sv_df$bin_chrom)

# Manhattan plot of the leading-SV bins; the horizontal line marks an adjusted
# p-value of 0.01.
manhatan_sv_p <- manhatan_sv_df %>%
  filter(adj.p.value.sv < 0.05, abs(beta) < abs(estimate)) %>%
  ggplot(aes(bp_cum, y = -log10(adj.p.value.sv))) +
  geom_point_rast(aes(color = as.factor(bin_chrom)), alpha = 0.8, size = 1) +
  geom_hline(aes(yintercept = -log10(0.01))) +
  scale_x_continuous(label = axis_sv_df$bin_chrom, breaks = axis_sv_df$center) +
  scale_y_continuous(limits = c(0, 70), expand = c(0, 0)) +
  scale_color_manual(values = chrom_colors) +
  theme(
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
    ## axis.text.x = element_text(angle = 90)
  ) +
  labs(
    x = "Chromosome", title = "Leading-SV mQTL bins across the genome",
    y = "Max -log10(adj.pvalue)"
  )

ggsave("plots/manhatan_top_sv.pdf", manhatan_sv_p, height = 7, width = 12)
ggsave("plots/manhatan_top_sv.png", manhatan_sv_p, height = 7, width = 12)

# Volcano plot (SV effect size vs. -log10 adjusted p) for significant
# leading-SV bins that do not overlap an SV. `sv_dist`, the smallest gap
# between an SV breakpoint and a bin edge, is computed but not plotted.
volcano_nosv_p <- bins_nosv_df %>%
  filter(adj.p.value.sv < 0.05, abs(beta) < abs(estimate)) %>%
  mutate(sv_dist = pmin(
    abs(sv_start - bin_start), abs(sv_end - bin_start),
    abs(sv_start - bin_end), abs(sv_end - bin_end)
  )) %>%
  ggplot(aes(x = estimate, y = -log10(adj.p.value.sv))) +
  geom_point(alpha = 0.5) +
  labs(title = "Top-SVs not overlapping methylation bin", x = "Effect size")
ggsave("plots/bins_nosv_volcano.pdf", volcano_nosv_p, width = 7, height = 7)

# Same plot for the bins that overlap an SV.
volcano_sv_p <- bins_sv_df %>%
  filter(adj.p.value.sv < 0.05, abs(beta) < abs(estimate)) %>%
  mutate(sv_dist = pmin(
    abs(sv_start - bin_start), abs(sv_end - bin_start),
    abs(sv_start - bin_end), abs(sv_end - bin_end)
  )) %>%
  ggplot(aes(x = estimate, y = -log10(adj.p.value.sv))) +
  geom_point(alpha = 0.5) +
  labs(title = "Top-SVs overlapping methylation bin", x = "Effect size")
ggsave("plots/bins_sv_volcano.pdf", volcano_sv_p, width = 7, height = 7)

# Same plot over all bins.
volcano_p <- bins_df %>%
  filter(adj.p.value.sv < 0.05, abs(beta) < abs(estimate)) %>%
  ggplot(aes(x = estimate, y = -log10(adj.p.value.sv))) +
  geom_point(alpha = 0.5) +
  labs(title = "Leading-SV-QTLs effect sizes", x = "Effect size")
ggsave("plots/bins_volcano.pdf", volcano_p, width = 7, height = 7)



# Interactive inspection of the significant leading-SV bins.
bins_df %>%
  filter(adj.p.value.sv < 0.05, abs(beta) < abs(estimate))

# Bar chart of significant SV bins by allele identity (reference vs.
# alternative) and effect direction, faceted by QTL class. A bin is
# "Leading-SV" when the SV effect exceeds the SNP effect in magnitude or when
# there is no SNP effect at all, and "SV" when the SNP effect is larger.
ref_alleles_p <- bins_df %>%
  filter(adj.p.value.sv < 0.05) %>%
  mutate(
    QTL = case_when(
      abs(beta) < abs(estimate) | is.na(beta) ~ "Leading-SV",
      abs(beta) > abs(estimate) ~ "SV"
    ),
    QTL = factor(QTL, levels = c("Leading-SV", "SV")),
    SV = case_when(Ref ~ "Reference", !Ref ~ "Alternative"),
    Effect = case_when(estimate > 0 ~ "Positive", estimate < 0 ~ "Negative")
  ) %>%
  ggplot(aes(x = SV, fill = Effect)) +
  geom_bar() +
  geom_text(
    stat = "count", aes(label = ..count..),
    position = position_stack(vjust = 0.5), vjust = 0.1
  ) +
  labs(
    x = "SV allele", y = "Methylation bins",
    title = "Effect and identity of SV alleles",
    subtitle = "In methylation bins associated with SVs"
  ) +
  facet_wrap(~QTL) +
  theme(legend.position = "top") +
  scale_fill_npg() +
  scale_color_npg()


# ECDF of SV allele frequency over distinct SV alleles, by QTL class and
# allele identity.
# NOTE(review): 464 is presumably the number of haplotypes that `Freq` counts
# over -- confirm against the source of mg_methyl.tsv.gz.
qtl_alleles_freq_p <- bins_df %>%
  filter(adj.p.value.sv < 0.05) %>%
  mutate(
    QTL = case_when(
      abs(beta) < abs(estimate) | is.na(beta) ~ "Leading-SV",
      abs(beta) > abs(estimate) ~ "SV"
    ),
    QTL = factor(QTL, levels = c("Leading-SV", "SV")),
    SV = case_when(Ref ~ "Reference", !Ref ~ "Alternative"),
    Effect = case_when(estimate > 0 ~ "Positive", estimate < 0 ~ "Negative")
  ) %>%
  select(sv_chrom, sv_start, sv_end, Freq, SV, QTL, allele) %>%
  distinct() %>%
  ggplot() +
  stat_ecdf(aes(x = Freq / 464, color = QTL, linetype = SV), size = 1) +
  labs(x = "SV allele frequency", y = "Count", title = "Frequency of SV-mQTLs") +
  theme(legend.position = "right") +
  scale_color_npg() +
  scale_fill_npg()


# Significant SV bins labelled by QTL class, with the distance between each
# bin and its SV.
#
# `sv_dist` is computed here because the plot below maps it to the x axis and
# `bins_df` has no such column. It is the smallest gap between an SV
# breakpoint and a bin edge, and 0 when the bin lies strictly inside the SV.
# The two filtered subsets already fix the QTL label, so the label is not
# re-derived after binding.
sv_dist_df <- bind_rows(
  bins_df %>%
    filter(adj.p.value.sv <= 0.05, abs(beta) > abs(estimate)) %>%
    mutate(QTL = "SV"),
  bins_df %>%
    filter(adj.p.value.sv <= 0.05, abs(beta) < abs(estimate)) %>%
    mutate(QTL = "Leading-SV")
) %>%
  mutate(
    QTL = factor(QTL, levels = c("Leading-SV", "SV")),
    SV = case_when(Ref ~ "Reference", !Ref ~ "Alternative"),
    Effect = case_when(estimate > 0 ~ "Positive", estimate < 0 ~ "Negative"),
    sv_dist = pmin(
      abs(sv_start - bin_start), abs(sv_end - bin_start),
      abs(sv_start - bin_end), abs(sv_end - bin_end)
    ),
    sv_dist = case_when(
      bin_start > sv_start & bin_end < sv_end ~ 0,
      TRUE ~ sv_dist
    )
  )

# ECDF of the bin-to-SV distance by QTL class and allele identity.
sv_dist_top_qtl_p <- sv_dist_df %>%
  filter(adj.p.value.sv < 0.05) %>%
  ggplot() +
  stat_ecdf(aes(x = sv_dist, color = QTL, linetype = SV), size = 1) +
  scale_x_continuous(breaks = c(0, 25e3, 50e3, 75e3, 100e3)) +
  labs(
    x = "Distance from QTL SV (bp)",
    title = "Distance distributions in SV-QTLs"
  ) +
  scale_color_npg() +
  scale_fill_npg()
ggsave("plots/bin_dist_top_qtl.pdf", sv_dist_top_qtl_p, height = 7, width = 7)

# Mean number of significant leading-SV bins per SV, split by whether the SV
# allele is the reference allele.
bins_df %>%
  filter(adj.p.value.sv <= 0.05, abs(beta) < abs(estimate)) %>%
  group_by(sv_chrom, sv_start, sv_end, Ref) %>%
  summarise(span = n(), .groups = "drop") %>%
  group_by(Ref) %>%
  summarise(mean(span))

# ECDF of the number of bins per SV. This plot used to be built and then
# discarded: the following `ggsave()` wrote `sv_dist_top_qtl_p` to
# "plots/bin_dist_top_qtl.pdf" a second time, and a stray `ggplot()` followed.
# The plot is now kept in its own object, titled for what it shows, and saved
# to its own file.
bin_span_top_qtl_p <- bins_df %>%
  filter(adj.p.value.sv <= 0.05, abs(beta) < abs(estimate)) %>%
  group_by(sv_chrom, sv_start, sv_end, Ref) %>%
  summarise(span = n(), .groups = "drop") %>%
  ggplot() +
  stat_ecdf(aes(x = span, color = Ref), size = 1) +
  labs(
    x = "Number of bins",
    title = "Number of bins per leading SV-QTL"
  ) +
  scale_color_npg() +
  scale_fill_npg()

ggsave("plots/bin_span_top_qtl.pdf", bin_span_top_qtl_p, height = 7, width = 7)

# Four-panel figure (2 x 2, panels auto-labelled): volcano, allele identity,
# allele frequency and bin-to-SV distance.
qtls2_fig <- plot_grid(
  plotlist = list(
    volcano_p, ref_alleles_p,
    qtl_alleles_freq_p, sv_dist_top_qtl_p
  ),
  nrow = 2, ncol = 2, labels = "AUTO"
)

ggsave(filename = "figures/mQTLs2.pdf", plot = qtls2_fig, height = 10, width = 12)
ggsave(filename = "figures/mQTLs2.pdf.png", plot = qtls2_fig, height = 10, width = 12)


## EXAMPLE


# Read every per-chromosome result file in the working directory (22 workers)
# and split the underscore-delimited `geneid` into bin coordinates. The result
# is a list with one table per file.
#
# `mc.preschedule` is a logical flag; it was previously given the number 22.
bins <- mclapply(
  list.files(pattern = "chr.*.tsv.gz"),
  function(p) {
    read_tsv(p) %>%
      separate(geneid,
        into = c("bin_chrom", "bin_start", "bin_end"),
        convert = TRUE, remove = TRUE, sep = "_"
      )
  },
  mc.cores = 22, mc.preschedule = TRUE
)


# SV genotypes in long format: one row per SV allele and sample. The SV
# coordinates and allele are parsed out of the underscore-delimited `snpid`.
qtl_svs <- read_tsv("data/qtl/sv_genotypes_bins.tsv.gz") %>%
  filter(!str_detect(chrom, "chr[XY]")) %>%
  select(-c(chrom, start, end)) %>%
  separate(
    snpid,
    into = c("sv_chrom", "sv_start", "sv_end", "allele"),
    sep = "_", convert = TRUE
  ) %>%
  mutate(sv_chrom = as.numeric(str_remove(sv_chrom, "chr"))) %>%
  pivot_longer(
    cols = -c(sv_chrom, sv_start, sv_end, allele),
    names_to = "Sample", values_to = "Genotype"
  )

# Per-sample bin values in long format.
# NOTE(review): columns 24:258 are selected by position and presumably hold
# the per-sample values -- confirm against the file header.
qtl_bins <- read_tsv("data/qtl/qtl_top_sv_bins.tsv.gz") %>%
  select(bin_chrom, bin_start, bin_end, sv_start, sv_end, allele, 24:258) %>%
  pivot_longer(
    cols = -c(bin_chrom, bin_start, bin_end, sv_start, sv_end, allele),
    names_to = "Sample", values_to = "Bin"
  )


# Attach each sample's genotype of the bin's top SV to its bin value.
qtls <- qtl_bins %>%
  left_join(
    qtl_svs,
    by = c("bin_chrom" = "sv_chrom", "sv_start", "sv_end", "allele", "Sample")
  )

# Plot `Bin` against `Genotype` for a single bin as a beeswarm with a linear
# fit.
#
# bin_df      Table with columns bin_chrom, bin_start, bin_end, Genotype, Bin.
# bin_chrom_, bin_start_, bin_end_
#             Coordinates of the bin to plot; rows are matched exactly.
#
# Returns the ggplot object, titled "<chrom>:<start>-<end>".
#
# NOTE(review): `theme_set()` replaces the session-wide default theme, so every
# plot rendered after the first call uses plain `theme_bw()`.
plot_qtl <- function(bin_df, bin_chrom_, bin_start_, bin_end_) {
  theme_set(theme_bw())
  region_label <- paste0(bin_chrom_, ":", bin_start_, "-", bin_end_)
  one_bin <- filter(
    bin_df,
    bin_chrom == bin_chrom_, bin_start == bin_start_, bin_end == bin_end_
  )
  ggplot(one_bin, aes(x = Genotype, y = Bin)) +
    geom_beeswarm(alpha = 0.5) +
    geom_smooth(method = "lm") +
    scale_y_continuous(limits = c(0, 1)) +
    scale_x_continuous(limits = c(-1, 3), breaks = c(0, 1, 2)) +
    labs(title = region_label)
}

# Interactive inspection of the bins led by allele ">s812887", ordered by
# position, with their distance to the SV (0 when the bin lies strictly
# inside the SV).
bins_df %>%
  filter(adj.p.value.sv <= 0.05, abs(estimate) > abs(beta)) %>%
  arrange(desc(estimate)) %>%
  select(bin_chrom, bin_start, bin_end, estimate, not_na, Ref, allele, sv_start, sv_end) %>%
  filter(allele == ">s812887") %>%
  arrange(bin_chrom, bin_start, bin_end) %>%
  mutate(
    sv_dist = pmin(
      abs(sv_start - bin_start), abs(sv_end - bin_start),
      abs(sv_start - bin_end), abs(sv_end - bin_end)
    ),
    sv_dist = case_when(
      bin_start > sv_start & bin_end < sv_end ~ 0,
      TRUE ~ sv_dist
    )
  )

# Genotype vs. bin value for five consecutive 200 bp bins on chromosome 17,
# starting at 44209600.
plot_grid(
  plotlist = lapply(
    seq(44209600, 44210400, by = 200),
    function(bin_start) plot_qtl(qtls, 17, bin_start, bin_start + 199)
  ),
  ncol = 2, nrow = 3, labels = "AUTO"
)
# No plot is passed, so ggsave() writes the most recently displayed plot.
ggsave("plots/example_qtl.pdf")
ggsave("plots/example_qtl.png")

## effect size distributions
# ECDF of absolute effect sizes for the top SV (`estimate`) and the top SNP
# (`beta`) of every bin, without any significance filter. Effects outside
# (-1, 1) are dropped and the x axis is limited to [0, 0.25].
bind_rows(
  bins_df %>%
    select(estimate, beta) %>%
    transmute(QTL = "Top-QTL-SV", effect = estimate),
  bins_df %>%
    select(estimate, beta) %>%
    transmute(QTL = "Top-QTL-SNP", effect = beta)
) %>%
  filter(effect < 1, effect > -1) %>%
  ggplot(aes(x = abs(effect), color = QTL)) +
  stat_ecdf() +
  scale_x_continuous(limits = c(0, 0.25)) +
  labs(title = "Absolute effect size distributions")
ggsave("plots/qtl_effect_size_distribution.png")


## QTL size distributions
# ECDF (log10 x axis) of the total length of significant bins attached to each
# SV allele, by QTL class and allele identity.
bins_df %>%
  filter(adj.p.value.sv <= 0.05) %>%
  mutate(
    SV = case_when(Ref ~ "Reference", !Ref ~ "Alternative"),
    QTL = case_when(
      abs(beta) < abs(estimate) | is.na(beta) ~ "Leading-SV-QTL",
      abs(beta) > abs(estimate) ~ "SV-QTL"
    )
  ) %>%
  select(bin_chrom, bin_start, bin_end, sv_start, sv_end, allele, SV, QTL) %>%
  group_by(bin_chrom, sv_start, sv_end, allele, SV, QTL) %>%
  summarise(span = sum(bin_end - bin_start)) %>%
  ggplot(aes(x = abs(span), color = QTL, linetype = SV)) +
  stat_ecdf() +
  scale_x_log10() +
  labs(
    title = "SV-QTL size distributions",
    x = "Span of QTL methylation bins (bp)"
  ) +
  scale_color_npg()
ggsave("plots/qtl_size_svs_distribution.png")

## QTL size distributions
# ECDF (log10 x axis) of the total length of significant bins per variant,
# comparing SVs (keyed by chromosome, SV coordinates and allele) with SNPs
# (keyed by chromosome and position).
bind_rows(
  bins_df %>%
    filter(adj.p.value.sv <= 0.05) %>%
    group_by(bin_chrom, sv_start, sv_end, allele) %>%
    summarise(span = sum(bin_end - bin_start)) %>%
    mutate(QTL = "SV") %>%
    select(QTL, span),
  bins_df %>%
    filter(adj.p.value.snp <= 0.05) %>%
    group_by(bin_chrom, snp_pos) %>%
    summarise(span = sum(bin_end - bin_start)) %>%
    mutate(QTL = "SNP") %>%
    select(QTL, span)
) %>%
  ggplot(aes(x = abs(span), color = QTL)) +
  stat_ecdf() +
  scale_x_log10() +
  labs(
    title = "QTL size distributions",
    x = "Span of QTL methylation bins (bp)"
  ) +
  scale_color_npg()

ggsave("plots/qtl_size_distribution.png")

# Read every table in data/gwas/ and stack them.
#
# The summary written below lives in the same directory, so it is excluded
# from the inputs; otherwise a second run would read its own output back in
# as if it were a GWAS table.
gwas_files <- list.files("data/gwas/", full.names = TRUE)
gwas_files <- gwas_files[basename(gwas_files) != "top_sv_gwas.tsv.gz"]
gwas <- lapply(gwas_files, read_tsv) %>% bind_rows()

# Rows where the SV has both the larger absolute effect and the smaller
# adjusted p-value compared with the SNP.
top_sv_gwas <- gwas %>%
  filter(abs(beta.sv) > abs(beta.snp), adj.p.value.sv < adj.p.value.snp)
top_sv_gwas %>% write_tsv("data/gwas/top_sv_gwas.tsv.gz")

## repeat masking
# RepeatMasker hits on the SV allele sequences. The numeric query name is
# mapped back to the allele through the index file, keeping only the repeat
# class (`tclass`) and repeat name (`tname`).
# NOTE(review): the index file has a .tsv extension but is parsed with
# read_csv() -- confirm that it really is comma-separated.
qtl_reps <- read_rm("data/qtl_alleles.fa.out") %>%
  as_tibble() %>%
  mutate(qname = as.numeric(qname)) %>%
  left_join(
    read_csv("data/qtl_alleles_index.tsv", col_names = c("index", "allele")),
    by = c("qname" = "index")
  ) %>%
  select(allele, tclass, tname)

# Attach the repeat annotation to every distinct non-"*" SV allele; the join
# uses the columns the two tables share.
qtl_reps <- bins_df %>%
  filter(allele != "*") %>%
  select(sv_chrom, sv_start, sv_end, allele) %>%
  distinct() %>%
  left_join(qtl_reps) %>%
  distinct()

# RepeatMasker track read from the CHM13 BED file as genomic ranges, with
# repeat name (column 4) and class (columns 7 and 8 joined by "/") as metadata.
chm13_reps <- read_tsv("data/chm13v2.0_RepeatMasker_4.1.2p1.2022Apr14.bed.gz",
  col_names = FALSE
) %>%
  mutate(
    chrom = str_remove(X1, "chr"), start = X2, end = X3, tname = X4,
    tclass = paste(X7, X8, sep = "/")
  ) %>%
  select(chrom, start, end, tname, tclass) %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# Genomic ranges of the SVs whose allele is "*".
bins_sv_gr <- bins_df %>%
  filter(allele == "*") %>%
  makeGRangesFromDataFrame(
    seqnames.field = "sv_chrom",
    start.field = "sv_start",
    end.field = "sv_end"
  )


# Repeats overlapping the "*" SVs, in the same layout as `qtl_reps`.
#
# `seqnames` comes back from the GRanges as a factor. `as.numeric()` on a
# factor returns the internal level codes rather than the chromosome numbers,
# so it has to go through `as.character()` first; otherwise `sv_chrom` no
# longer matches `bins_df` in the join below.
qtl_star_reps <- join_overlap_left(bins_sv_gr, chm13_reps) %>%
  as_tibble() %>%
  mutate(allele = "*") %>%
  dplyr::rename(sv_chrom = seqnames, sv_start = start, sv_end = end) %>%
  mutate(sv_chrom = as.numeric(as.character(sv_chrom))) %>%
  select(sv_chrom, sv_start, sv_end, allele, tclass, tname) %>%
  distinct()

# SV-level QTL table (bin and SNP coordinates dropped) annotated with repeats
# for both "*" and sequence-resolved alleles; the join uses the columns the
# two tables share.
bins_rep_df <- left_join(
  bins_df %>%
    select(-c(bin_chrom, bin_start, bin_end, snp_chrom, snp_pos)),
  bind_rows(qtl_star_reps, qtl_reps) %>%
    distinct()
)

## mutate(QTL = case_when(abs(beta) < abs(estimate) | is.na(beta) ~ "Leading-SV-QTL",
##   abs(beta) > abs(estimate) ~ "SV-QTL")) %>%
## , ## mutate(QTL = factor(QTL, levels = c("Leading-SV-QTL", "SV-QTL"))) %>%


# Share of distinct significant SV-QTL alleles per repeat class, by effect
# direction, faceted by allele identity. Only repeat classes with more than
# 2500 rows among the significant associations are kept, and shares are
# computed within each allele identity (`SV`).
#
# Changes from the earlier version: grouping is dropped before `tclass` is
# edited, the x axis is labelled in percent to match its title (`Percent` is
# a fraction), the "Repaet" axis-title typo is fixed, and the plot is passed
# to `ggsave()` explicitly instead of relying on the last plot.
rep_alleles_p <- bins_rep_df %>%
  filter(adj.p.value.sv <= 0.05) %>%
  group_by(tclass) %>%
  mutate(N = dplyr::n()) %>%
  ungroup() %>%
  filter(N > 2500) %>%
  mutate(
    tclass = str_remove(tclass, "/undefined"),
    SV = case_when(Ref ~ "Reference", !Ref ~ "Alternative"),
    Effect = case_when(estimate > 0 ~ "Positive", estimate < 0 ~ "Negative")
  ) %>%
  select(sv_chrom, sv_start, sv_end, allele, tclass, SV, Effect) %>%
  distinct() %>%
  group_by(tclass, SV, Effect) %>%
  summarise(N_tclass_sv_effect = dplyr::n(), .groups = "drop") %>%
  group_by(SV) %>%
  mutate(Percent = N_tclass_sv_effect / sum(N_tclass_sv_effect)) %>%
  ungroup() %>%
  replace_na(replace = list(tclass = "None")) %>%
  ggplot() +
  geom_col(aes(y = reorder(tclass, Percent), x = Percent, fill = Effect)) +
  scale_x_continuous(labels = scales::percent) +
  labs(
    title = "Repeats associated with SV-QTL alleles", y = "Repeat",
    x = "% SV alleles associated with repeat",
    subtitle = "Stratified by reference allele"
  ) +
  facet_wrap(~SV) +
  scale_fill_npg() +
  scale_color_npg() +
  theme(legend.position = "top")
ggsave("plots/qtl_sv_allele_repeatmask.pdf", rep_alleles_p, width = 7, height = 7)
ggsave("plots/qtl_sv_allele_repeatmask.png", rep_alleles_p, width = 7, height = 7)

# Distance between each significant bin and its QTL variant. For SVs this is
# the smallest gap between a breakpoint and a bin edge, or 0 when the bin lies
# strictly inside the SV; for SNPs it is the distance to the nearer bin edge.
snp_dist_df <- bind_rows(
  bins_df %>%
    filter(adj.p.value.sv <= 0.05) %>%
    mutate(
      QTL = "SV",
      sv_dist = pmin(
        abs(sv_start - bin_start), abs(sv_end - bin_start),
        abs(sv_start - bin_end), abs(sv_end - bin_end)
      ),
      Distance = case_when(
        bin_start > sv_start & bin_end < sv_end ~ 0,
        TRUE ~ sv_dist
      )
    ) %>%
    select(QTL, Distance),
  bins_df %>%
    filter(adj.p.value.snp <= 0.05) %>%
    transmute(
      QTL = "SNP",
      Distance = pmin(abs(snp_pos - bin_start), abs(snp_pos - bin_end))
    )
)

# ECDF of the distances, SV vs. SNP.
ggplot(snp_dist_df, aes(x = Distance, color = QTL)) +
  stat_ecdf() +
  scale_color_npg() +
  labs(
    title = "Distance distributions in SNP-QTLs and SV-QTLs",
    subtitle = "FDR < 0.05"
  )
ggsave("plots/bin_dist_sv_snp.pdf", width = 7, height = 7)
ggsave("plots/bin_dist_sv_snp.png", width = 7, height = 7)


## methylation state of SVs
# Methylation rates from the two rate tables, stacked, without `mc_freq`.
methylation_df <- bind_rows(
  read_tsv("data/methylation_rate.tsv.gz"),
  read_tsv("data/methylation_rate_ref.tsv.gz")
) %>%
  select(-mc_freq)

# Distinct significant, non-"*" SV alleles with their effect direction,
# expanded to one row per node: the allele string is split on "<" and ">" and
# the empty pieces are dropped.
qtl_svs <- bins_df %>%
  filter(adj.p.value.sv <= 0.05) %>%
  mutate(Effect = case_when(estimate > 0 ~ "Positive", estimate < 0 ~ "Negative")) %>%
  select(sv_chrom, sv_start, sv_end, allele, Ref, Effect) %>%
  distinct() %>%
  filter(allele != "*") %>%
  mutate(node = allele) %>%
  separate_rows(node, sep = "[<>]") %>%
  filter(node != "")


# Per SV allele: number of rows with a positive methylation rate (`nmc`) and
# the mean rate (`mean_mc`). The join uses the columns the two tables share.
qtl_svs_methylation <- qtl_svs %>%
  left_join(methylation_df) %>%
  group_by(sv_chrom, sv_start, sv_end, allele, Ref, Effect) %>%
  summarise(
    nmc = sum(mc_rate > 0, na.rm = TRUE),
    mean_mc = mean(mc_rate, na.rm = TRUE)
  ) %>%
  mutate(SV = case_when(Ref ~ "Reference SV", !Ref ~ "Alternative SV"))

# Histogram of the mean rate per allele identity and effect direction.
# Alleles whose mean is NaN are left out.
qtl_svs_methylation %>%
  filter(!is.nan(mean_mc)) %>%
  ggplot(aes(x = mean_mc)) +
  geom_histogram() +
  facet_wrap(~SV + Effect) +
  labs(
    title = "Mean methylation rate", subtitle = "In CpGs of SV-QTL alleles",
    x = "Mean methylation rate in SV"
  )

ggsave("plots/mean_methylation_rate_SV_qtls.png")
ggsave("plots/mean_methylation_rate_SV_qtls.pdf")
