library(GenomicRanges)
library(parallel)
library(tidyverse)
library(broom)
library(tidymodels)

# The chromosome to process is the first trailing command-line argument.
# It is used below both to filter the SV table and to build the input and
# output file names, so fail early with a clear message when it is missing
# instead of letting `NA` leak into those paths.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1 || is.na(args[1]) || !nzchar(args[1])) {
  stop(
    "A chromosome name must be supplied as the first command-line argument.",
    call. = FALSE
  )
}
gchrom <- args[1]

# Load the SV table: an identifier column `snpid` followed by value columns
# (presumably one per sample -- TODO confirm), then keep only the SVs on the
# requested chromosome whose row total falls inside the allowed window.
sv_df <- read_tsv("data/qtl/svs.txt") %>%
  # Row total over every column except the identifier.
  mutate(Freq = rowSums(across(-snpid))) %>%
  # Keep rows with 24 <= Freq <= 436; rows whose total is NA are dropped.
  # NOTE(review): presumably the bounds of a frequency cutoff (the output
  # file is labelled "MAF5") -- confirm. A narrower window, 47 <= Freq <= 427,
  # was used previously and is kept here for reference only.
  filter(Freq >= 24 & Freq <= 436) %>%
  # The helper column is only needed for the filter above.
  select(-Freq) %>%
  # Split the identifier into coordinates (default separator: any run of
  # non-alphanumeric characters) while keeping `snpid` itself.
  separate(
    snpid,
    into = c("chrom", "start", "end"),
    convert = TRUE,
    remove = FALSE
  ) %>%
  filter(chrom == gchrom)

# Per-bin table for the requested chromosome: a `geneid` column plus value
# columns that are matched by name against the SV table below.
bin_df <- read_tsv(
  paste0("ref_mc/bychrom/", gchrom, ".tsv.gz")
)

# Columns present in both tables; only these take part in the QTL scan.
qtl_samples <- intersect(names(bin_df), names(sv_df))

# Restrict both tables to the shared columns, in the same order, so that
# row i of one table lines up column-by-column with row j of the other.
sv_df <- select(sv_df, snpid, chrom, start, end, all_of(qtl_samples))
bin_df <- select(bin_df, geneid, all_of(qtl_samples))

# Genomic ranges for the SVs, built from the chrom/start/end columns of
# `sv_df`; all remaining columns are carried along as metadata.
svs_gr <- makeGRangesFromDataFrame(sv_df, keep.extra.columns = TRUE)

# Genomic ranges for the bins. Coordinates are parsed out of `geneid`, which
# is split on "_" into chrom/start/end; `geneid` itself is kept as metadata.
# Row order matches `bin_df`, so range indices can be used as row indices.
bin_gr <- makeGRangesFromDataFrame(
  separate(
    select(bin_df, geneid),
    geneid,
    into = c("chrom", "start", "end"),
    sep = "_",
    convert = TRUE,
    remove = FALSE
  ),
  keep.extra.columns = TRUE
)
# Disabled follow-up step, kept for reference:
## subsetByOverlaps(bubbles_gr, invert = T)

# Pair each bin with every SV that overlaps it or lies within `maxgap`
# (1e5) positions of it. Hits are index pairs into `bin_gr` / `svs_gr`.
bin_sv_pairs <- findOverlaps(query = bin_gr, subject = svs_gr, maxgap = 1e5)

# Parallel index vectors: element i of each one describes the i-th pair.
sv_list <- subjectHits(bin_sv_pairs)
bin_list <- queryHits(bin_sv_pairs)

library(broom)
# Number of bin/SV pairs that will be tested.
print(length(bin_list))

# Convert the value columns to plain matrices once, up front, so that each
# pair below needs only a matrix-row lookup instead of a tibble row subset
# followed by a matrix conversion. Column 1 of `bin_df` is `geneid`; columns
# 1-4 of `sv_df` are `snpid`, `chrom`, `start`, `end`.
bin_mat <- as.matrix(bin_df[, -1])
sv_mat <- as.matrix(sv_df[, -c(1, 2, 3, 4)])

# Fit one linear model `bin ~ sv` per bin/SV pair, in parallel.
# Each element of `qtls` is either the tidied coefficient table of one fit
# (with the pair's identifiers and the number of non-missing bin values
# attached) or NULL when the pair was skipped or its fit failed.
qtls <- mclapply(seq_along(bin_list), function(i) {
  # Coarse progress indicator, printed from the worker processes.
  if (i %% 1000 == 0) {
    print(i)
  }

  bin_idx <- bin_list[i]
  sv_idx <- sv_list[i]
  cpgi_name <- bin_df$geneid[bin_idx]
  sv_name <- sv_df$snpid[sv_idx]

  # `as.vector()` strips the column names that matrix row extraction keeps.
  df <- tibble(
    bin = as.vector(bin_mat[bin_idx, ]),
    sv = as.vector(sv_mat[sv_idx, ])
  )

  # Only the response is counted here; missing values in `sv` are not.
  n_notna <- sum(!is.na(df$bin))
  if (n_notna < 2) {
    return(NULL)
  }

  # A failing fit (e.g. no row where both `bin` and `sv` are observed) must
  # not escape the worker: mclapply would return "try-error" objects, for
  # the whole prescheduled chunk of that core, and combining the results
  # afterwards would fail. Log the pair and skip it instead.
  tryCatch(
    tidy(lm(bin ~ sv, df)) %>%
      mutate(cpgi_name = cpgi_name, sv_name = sv_name, not_na = n_notna),
    error = function(e) {
      message(
        "lm failed for bin ", cpgi_name, " / SV ", sv_name, ": ",
        conditionMessage(e)
      )
      NULL
    }
  )
}, mc.cores = 40, mc.preschedule = TRUE)

# Stack the per-pair coefficient tables into one data frame; NULL entries
# (pairs that produced no fit) contribute no rows.
qtls_df <- bind_rows(qtls)

# One gzip-compressed TSV per chromosome.
write_tsv(
  qtls_df,
  paste0("data/qtl/models_MAF5_", gchrom, ".tsv.gz")
)
