library(Biostrings)
library(Rsamtools)
library(rtracklayer)
library(tidyverse)

# Input locations: reference genome FASTA and the variant table.
ref_fasta <- "mm39.fa"
variant_file <- "MpileupDeepvariant_RepeatRemoved_35bpRemoved_BlaclListGenes_Homopolymer_ART_NAT_FINAL.tab"

# Stop immediately when an input is missing, rather than failing later with a
# less direct message from the reader or the FASTA accessor.
for (input_path in c(ref_fasta, variant_file)) {
  if (!file.exists(input_path)) {
    stop("Input file not found: ", input_path, call. = FALSE)
  }
}

# Read the variant file (assumes tab-delimited with header)
variants <- read.delim(variant_file, header = TRUE, stringsAsFactors = FALSE)

# Preview the table so the column layout can be checked by eye; print() makes
# the preview appear under source() as well as under Rscript.
print(head(variants))

# The first two columns are taken to be chromosome and position and are
# renamed accordingly; `sample` and `cohort` must already exist by name.
if (ncol(variants) < 2) {
  stop(
    "Variant table needs at least two columns (chromosome, position); found ",
    ncol(variants), ".",
    call. = FALSE
  )
}
colnames(variants)[1:2] <- c("chr", "pos")

# Validate what the downstream steps read, so a malformed table is reported
# here instead of after the sequence extraction. `$` is used on purpose: it is
# the same lookup the later steps perform on this data frame.
if (is.null(variants$sample) || is.null(variants$cohort)) {
  stop(
    "Variant table must contain 'sample' and 'cohort' columns; found: ",
    paste(colnames(variants), collapse = ", "),
    call. = FALSE
  )
}
if (!is.numeric(variants$pos) || anyNA(variants$pos)) {
  stop(
    "Column 2 (position) must be numeric with no missing values.",
    call. = FALSE
  )
}

# One 100 bp window per variant: 49 bp before the variant position, the
# position itself, and 50 bp after it.
window_ranges <- with(variants, IRanges(start = pos - 49, end = pos + 50))

# Attach chromosome names to the windows and carry sample / cohort along as
# per-range metadata columns.
variants_gr <- GRanges(
  seqnames = variants$chr,
  ranges = window_ranges,
  sample = variants$sample,
  cohort = variants$cohort
)

# Load reference genome.
# FaFile() is provided by Rsamtools; calling it through the namespace works
# whether or not Rsamtools is attached. open() is the generic that dispatches
# to the FaFile method.
# NOTE(review): this relies on an index file for the FASTA being present -
# confirm it has been created (e.g. with Rsamtools::indexFa) before running.
ref_genome <- Rsamtools::FaFile(ref_fasta)
open(ref_genome)

# Extract sequences.
# Everything that touches the open handle runs inside tryCatch() so the file
# is closed again whether extraction succeeds or fails. The braces are
# evaluated in the calling environment, so the update to `variants_gr`
# persists after the block.
seqs <- tryCatch(
  {
    ref_lengths <- seqlengths(seqinfo(ref_genome))

    # Report chromosome names the reference does not contain up front,
    # instead of letting the sequence lookup fail on them.
    window_chr <- seqlevels(variants_gr)
    unknown_chr <- window_chr[!window_chr %in% names(ref_lengths)]
    if (length(unknown_chr) > 0) {
      stop(
        "Chromosomes in the variant table but not in ", ref_fasta, ": ",
        paste(unknown_chr, collapse = ", "),
        call. = FALSE
      )
    }

    # Windows around variants close to a chromosome end extend before base 1
    # or past the last base, which cannot be fetched. Record the reference
    # lengths on the windows and clip them to the valid range. The assignment
    # warns about exactly those out-of-bound windows, which trim() fixes on
    # the next line, so that one warning is silenced.
    suppressWarnings(seqlengths(variants_gr) <- ref_lengths[window_chr])
    variants_gr <- trim(variants_gr)

    getSeq(ref_genome, variants_gr)
  },
  finally = close(ref_genome)
)

# GC fraction per window: tabulate G and C as proportions of each extracted
# sequence, then add the two columns row by row.
# NOTE(review): presumably the proportions are relative to the full window
# width, so N bases would lower the value - confirm if N-rich regions matter.
gc_fraction <- seqs %>%
  letterFrequency(letters = c("G", "C"), as.prob = TRUE) %>%
  rowSums()

# Pair every window's GC fraction with the sample and cohort it came from.
window_meta <- mcols(variants_gr)
result_df <- data.frame(
  sample = window_meta$sample,
  cohort = window_meta$cohort,
  GC_content = gc_fraction
)

# Average the window GC fraction within every cohort / sample pair and return
# an ungrouped table.
by_sample <- group_by(result_df, cohort, sample)
summary_stats <- summarise(
  by_sample,
  mean_GC = mean(GC_content),
  .groups = "drop"
)

# Write the per-sample means as a tab-separated file without row names or
# quoting.
write.table(
  summary_stats,
  file = "GC_content_by_sample_cohort.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
