library(GenomicRanges)
library(Biostrings)
library(rtracklayer)
library(tidyverse)
library(tibble)

# Input paths ----
# NOTE(review): ref_fasta is not read anywhere in the visible part of this
# script; it is kept in case a later section depends on it.
ref_fasta <- "mm39.fa"
variant_file <- "MpileupDeepvariant_RepeatRemoved_35bpRemoved_BlaclListGenes_Homopolymer_ART_NAT_FINAL.tab"
CpG_file <- "CpG_Islands_ALL.bed"

# Fail early, naming the offending path(s), instead of erroring deep inside a
# reader further down the script.
input_files <- c(variant_file, CpG_file)
missing_files <- input_files[!file.exists(input_files)]
if (length(missing_files) > 0) {
  stop(
    "Input file(s) not found: ", paste(missing_files, collapse = ", "),
    call. = FALSE
  )
}

# Load variant data ----
# `chr` is forced to character so that a file whose leading rows hold purely
# numeric chromosome names is not type-guessed as numeric; every other column
# is still guessed, as before.
variant_df <- read_tsv(
  variant_file,
  col_types = cols(chr = col_character())
)

# Inspect column names
colnames(variant_df)

# The variant table must contain: chr, pos (single-base position) and
# cohort (ART/NAT). Adjust the names below if the file uses different headers.
required_cols <- c("chr", "pos", "cohort")
missing_cols <- setdiff(required_cols, colnames(variant_df))
if (length(missing_cols) > 0) {
  stop(
    "Variant file is missing required column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Missing coordinates cannot be turned into ranges, and a missing cohort cannot
# be used as a row name of the contingency table built later, so stop here with
# a clear message rather than failing cryptically downstream.
incomplete_rows <- !complete.cases(variant_df[required_cols])
if (any(incomplete_rows)) {
  stop(
    sum(incomplete_rows),
    " variant row(s) have missing chr/pos/cohort values",
    call. = FALSE
  )
}

# One width-1 range per variant (start == end == pos), carrying the cohort
# label as a metadata column.
variant_gr <- GRanges(
  seqnames = variant_df$chr,
  ranges = IRanges(start = variant_df$pos, end = variant_df$pos),
  cohort = variant_df$cohort
)

# Read the CpG island intervals from the BED file.
CpG_gr <- rtracklayer::import(CpG_file)

# Pair every variant (query) with each CpG island (subject) it overlaps.
overlaps <- findOverlaps(query = variant_gr, subject = CpG_gr)

# A variant is flagged TRUE when its row index appears among the query hits,
# i.e. it overlaps at least one CpG island; all other rows are FALSE.
variant_df$in_CpG <- seq_len(nrow(variant_df)) %in% queryHits(overlaps)

# Count variants per cohort and CpG status, then spread the two statuses into
# columns (one row per cohort). Combinations that never occur are filled with 0.
summary_table <- variant_df %>%
  mutate(in_CpG = ifelse(in_CpG, "Inside_CpG", "Outside_CpG")) %>%
  count(cohort, in_CpG, name = "n") %>%
  pivot_wider(
    names_from = in_CpG,
    values_from = n,
    values_fill = 0
  )

print(summary_table)

# Build the cohort x CpG-status contingency matrix for Fisher's exact test ----
# pivot_wider() only creates a column for a status that actually occurs, so a
# data set with no variants inside (or outside) CpG islands would give a
# one-column matrix that fisher.test() rejects. Any absent status is therefore
# added as an explicit column of zero counts.
count_cols <- c("Inside_CpG", "Outside_CpG")
fisher_counts <- as.data.frame(summary_table)

absent_cols <- setdiff(count_cols, colnames(fisher_counts))
if (length(absent_cols) > 0) {
  warning(
    "No variants in categor", if (length(absent_cols) > 1) "ies" else "y",
    " ", paste(absent_cols, collapse = ", "),
    "; using zero counts",
    call. = FALSE
  )
  for (absent_col in absent_cols) {
    fisher_counts[[absent_col]] <- rep(0, nrow(fisher_counts))
  }
}

# The test compares cohorts, so at least two of them are required.
if (nrow(fisher_counts) < 2) {
  stop(
    "Fisher's exact test needs at least two cohorts; found ",
    nrow(fisher_counts),
    call. = FALSE
  )
}

# Rows = cohorts, columns = Inside_CpG / Outside_CpG (selected explicitly so
# the column order and content do not depend on the upstream table layout).
fisher_matrix <- fisher_counts %>%
  select(cohort, all_of(count_cols)) %>%
  column_to_rownames("cohort") %>%
  as.matrix()

# Run Fisher's Exact Test
fisher_result <- fisher.test(fisher_matrix)

# Print results
print(fisher_matrix)
print(fisher_result)

