###############################################
# DNM DISTRIBUTION PER CHR - NORMALIZED BY μ
###############################################

# Load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(Rsamtools)
library(GenomicRanges)
library(BSgenome.Mmusculus.UCSC.mm39)

# STEP 1: Read and filter SNP data
# The table is expected to provide at least the columns `chr`, `pos` and
# `cohort`, which are the ones used by the steps below.
df <- read.table("MpileupDeepvariant_RepeatRemoved_35bpRemoved_BlaclListGenes_Homopolymer_ART_NAT_FINAL.tab",
  header = TRUE)

# Keep autosomes only and harmonise the cohort labels ("Natural" -> "NAT").
# Any label that is neither ART nor NAT/Natural becomes NA here.
df <- df %>%
  filter(chr %in% paste0("chr", 1:19)) %>%
  mutate(
    chr = as.character(chr),
    cohort = case_when(
      cohort %in% c("NAT", "Natural") ~ "NAT",
      cohort == "ART" ~ "ART"
    )
  )

# Rows with an unrecognised cohort cannot be attributed to either group.
# Drop them explicitly (and say so) instead of carrying NA cohorts into the
# per-bin counts, where a bin holding only such rows would end up as 0 vs 0
# and make the per-bin Poisson test fail.
n_unknown_cohort <- sum(is.na(df$cohort))
if (n_unknown_cohort > 0) {
  warning(
    n_unknown_cohort,
    " variant(s) with an unrecognised cohort label were dropped",
    call. = FALSE
  )
}

df <- df %>%
  filter(!is.na(cohort)) %>%
  mutate(cohort = factor(cohort, levels = c("ART", "NAT")))

# STEP 2: Represent every SNP as a single-base range that carries its cohort
# label as a metadata column
snp_gr <- GRanges(
  seqnames = df$chr,
  ranges = IRanges(start = df$pos, end = df$pos),
  cohort = df$cohort
)

# STEP 3: Tile the 19 autosomes of mm39 into fixed-width bins
bin_size <- 1e7  # bin width in bp (10 Mb)
genome_bins <- tileGenome(
  seqlengths = seqlengths(BSgenome.Mmusculus.UCSC.mm39)[paste0("chr", seq_len(19))],
  tilewidth = bin_size,
  # return one flat set of ranges whose tiles never span two chromosomes
  cut.last.tile.in.chrom = TRUE
)

# STEP 4: Count SNPs per bin per cohort
# `hits` pairs each SNP (query) with the bin (subject) that contains it.
hits <- findOverlaps(snp_gr, genome_bins)
snp_df <- data.frame(
  bin = subjectHits(hits),
  cohort = snp_gr$cohort[queryHits(hits)]
)

# One row per bin that received at least one SNP, one column per cohort
bin_counts <- snp_df %>%
  group_by(bin, cohort) %>%
  summarise(DNM_count = n(), .groups = "drop") %>%
  pivot_wider(names_from = cohort, values_from = DNM_count, values_fill = 0)

# STEP 5: Add bin coordinates
# Start from the complete list of bins so that bins without any DNM are kept
# with a count of 0. Otherwise they would be missing from the table and the
# line plot would interpolate straight across them.
bin_coords <- as.data.frame(genome_bins)
bin_counts <- dplyr::tibble(bin = seq_len(nrow(bin_coords))) %>%
  left_join(bin_counts, by = "bin") %>%
  replace_na(list(ART = 0, NAT = 0)) %>%
  mutate(
    chr = as.character(bin_coords$seqnames[bin]),
    start = bin_coords$start[bin],
    end = bin_coords$end[bin]
  )

# STEP 6: Reshape data to long format for plotting
df_long <- bin_counts %>%
  pivot_longer(cols = c("ART", "NAT"), names_to = "Cohort", values_to = "DNM_count") %>%
  mutate(
    midpoint_Mb = (start + end) / 2 / 1e6,  # bin midpoint in Mb
    Cohort = factor(Cohort, levels = c("ART", "NAT"))
  )

# STEP 7: Calculate mutation rate μ per cohort
# NOTE(review): FDR and FNR are defined here but are not used in the formula
# below, and `total` is the cohort-wide DNM count (it is not divided by the
# number of individuals) - confirm this is the intended definition of μ.
FDR <- 0.01
FNR <- 0.03
G <- 2.23e9  # effective genome size (bp)

total_counts <- df_long %>%
  group_by(Cohort) %>%
  summarise(total = sum(DNM_count), .groups = "drop") %>%
  mutate(mu = total / (2 * G))

# STEP 8: Normalize bin counts by mutation rate
# DNM_rate_norm = bin count / μ of the bin's own cohort
df_long <- df_long %>%
  left_join(dplyr::select(total_counts, Cohort, mu), by = "Cohort") %>%
  mutate(DNM_rate_norm = DNM_count / mu)

# STEP 8b: Statistical test between cohorts (Poisson test per bin)
# `bin_counts` already holds one row per bin with the ART and NAT counts.
# NOTE(review): poisson.test() is called with its default time base, i.e. the
# raw counts of the two cohorts are compared as if both had the same exposure;
# confirm that is appropriate if the cohorts differ in size.
df_wide <- bin_counts %>%
  dplyr::select(bin, ART, NAT) %>%
  mutate(
    p_value = mapply(function(x, y) {
      # A bin with no DNM in either cohort carries no information, and
      # poisson.test() cannot be run on 0 vs 0, so report NA for it.
      if (x + y == 0) {
        return(NA_real_)
      }
      poisson.test(c(x, y))$p.value
    }, ART, NAT),
    # p.adjust() leaves NA p-values out of the correction
    p_adj = p.adjust(p_value, method = "fdr")
  )

df_long <- df_long %>%
  left_join(df_wide %>% dplyr::select(bin, p_value, p_adj), by = "bin")

# STEP 9: Add chromosome lengths (in Mb)
# The lengths are read from the same mm39 assembly that was used to build the
# bins, so the x-axis limits always agree with the bin coordinates. The
# previously hard-coded values were the GRCm38/mm10 lengths, which differ
# from mm39.
chrom_lengths <- data.frame(
  chr = paste0("chr", 1:19),
  chr_length_Mb = as.numeric(
    seqlengths(BSgenome.Mmusculus.UCSC.mm39)[paste0("chr", 1:19)]
  ) / 1e6,
  stringsAsFactors = FALSE
)

# Join on a character key, then turn `chr` into a factor so that the facets
# are ordered chr1 ... chr19 rather than alphabetically.
df_long <- df_long %>%
  mutate(chr = as.character(chr)) %>%
  left_join(chrom_lengths, by = "chr") %>%
  mutate(chr = factor(chr, levels = paste0("chr", 1:19)))

# STEP 10: Plot normalized DNM counts
# One facet per chromosome; each cohort is drawn as a line with square markers
# at the bin midpoints.
l <- ggplot(df_long, aes(x = midpoint_Mb, y = DNM_rate_norm, group = Cohort)) +
  geom_line(aes(color = Cohort), size = 1.2) +
  geom_point(aes(fill = Cohort), shape = 22, size = 4, color = "black") +
  geom_blank(aes(x = chr_length_Mb)) +  # ensures x-axis extends to chromosome length
  facet_wrap(~chr, scales = "free_x", nrow = 10) +
  scale_fill_manual(values = c("ART" = "#E69F00", "NAT" = "#666699")) +
  scale_color_manual(values = c("ART" = "#E69F00", "NAT" = "#666699")) +
  labs(
    x = "Genomic Position (Mb)",
    # The plotted value is the per-bin DNM count divided by the cohort's μ,
    # not a per-bp mutation rate, so the axis is labelled accordingly.
    y = "Normalized DNM count (per-bin count / \u03bc)",
    fill = "Cohort"
  ) +
  theme_classic(base_size = 36) +
  theme(
    strip.background = element_rect(fill = "lightgray", color = NA),
    strip.text = element_text(color = "black"),
    legend.title = element_text(size = 30),
    legend.text = element_text(size = 28),
    legend.key.size = unit(2, "lines")
  ) +
  guides(
    fill = guide_legend(override.aes = list(shape = 22, size = 6))
  )

# OPTIONAL: Highlight significant bins (adjusted p < 0.05)
# Bins whose adjusted p-value is NA are dropped by filter() and therefore are
# not highlighted.
l <- l + geom_point(
  data = df_long %>% filter(p_adj < 0.05),
  aes(x = midpoint_Mb, y = DNM_rate_norm),
  shape = 21, size = 5, fill = "white", color = "red", stroke = 1.5
)

# Display plot
print(l)


######################
# GC content
######################
# Set paths
ref_fasta <- "mm39.fa"
variant_file <- "MpileupDeepvariant_RepeatRemoved_35bpRemoved_BlaclListGenes_Homopolymer_ART_NAT_FINAL.tab"

# Read the variant file (assumes tab-delimited with header)
variants <- read.delim(variant_file, header = TRUE, stringsAsFactors = FALSE)

# Check column names (adjust if needed)
head(variants)

# Assume columns: chr, pos, sample, cohort
# The first two columns are renamed by position, so they must hold the
# chromosome and the position in that order.
colnames(variants)[1:2] <- c("chr", "pos")

# Harmonise cohort labels the same way as in STEP 1 ("Natural" -> "NAT") so
# that the cohort comparison further down sees exactly one label per cohort.
variants$cohort[variants$cohort %in% c("NAT", "Natural")] <- "NAT"

# Build GRanges with 100 bp window centered at the variant
# (49 bp upstream + the variant itself + 50 bp downstream = 100 bp)
# NOTE(review): a variant within 49 bp of a chromosome start would produce a
# window starting before position 1 - confirm no such variants exist.
variants_gr <- GRanges(
  seqnames = variants$chr,
  ranges = IRanges(start = variants$pos - 49, end = variants$pos + 50),
  sample = variants$sample,
  cohort = variants$cohort
)

# Load reference genome
ref_genome <- FaFile(ref_fasta)
open(ref_genome)

# Extract the sequence of every 100 bp window, and always release the FASTA
# file handle afterwards - also when the extraction fails.
seqs <- tryCatch(
  getSeq(ref_genome, variants_gr),
  finally = close(ref_genome)
)

# Compute GC content
# letterFrequency() returns one column per requested letter (as a fraction of
# the window because as.prob = TRUE); G and C are summed per window.
gc_content <- letterFrequency(seqs, letters = c("G", "C"), as.prob = TRUE)
gc_content_df <- rowSums(gc_content)  # numeric vector, one value per variant

# Combine with metadata
result_df <- data.frame(
  sample = mcols(variants_gr)$sample,
  cohort = mcols(variants_gr)$cohort,
  GC_content = gc_content_df
)

# Summarize GC content by sample and cohort
summary_stats <- result_df %>%
  group_by(cohort, sample) %>%
  summarise(mean_GC = mean(GC_content), .groups = "drop")


##################
# VIOLIN PLOT
#################
# Wilcoxon rank-sum test comparing the per-variant GC content (100 bp window
# around each variant) between the two cohorts
res <- wilcox.test(GC_content ~ cohort, data = result_df)
print(res)

v <- ggplot(result_df, aes(x = cohort, y = GC_content, fill = cohort)) +
  geom_violin(trim = FALSE, alpha = 0.8, color = NA) +
  geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.5, color = "black") +
  # Named values tie each colour to its cohort, as in the per-chromosome plot
  scale_fill_manual(values = c("ART" = "#E69F00", "NAT" = "#666699")) +
  theme_classic(base_size = 46) +
  labs(y = "GC Content (100 bp windows)",
       x = "") +
  # Report the p-value with 3 significant digits so that a very small value
  # is not displayed as 0 (which rounding to 3 decimals would do).
  annotate("text", x = Inf, y = Inf, label = paste("Wilcoxon P-value =", signif(res$p.value, 3)),
           hjust = 1.1, vjust = 1.1, size = 10) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.title = element_blank(),
    legend.position = "none",  # the x-axis already names the cohorts
    axis.text.x = element_text(size = 40),
    axis.text.y = element_text(size = 40))

# Display plot (explicit print so it also renders when the script is sourced)
print(v)

#######################
# CpG Islands
#######################
# Load necessary libraries
library(GenomicRanges)
library(Biostrings)
library(rtracklayer)
library(tidyverse)
library(tibble)

# Set paths
ref_fasta <- "mm39.fa"
variant_file <- "MpileupDeepvariant_RepeatRemoved_35bpRemoved_BlaclListGenes_Homopolymer_ART_NAT_FINAL.tab"
CpG_file <- "CpG_Islands_ALL.bed"

# Load variant data (col_types = cols() lets readr guess the column types
# without printing the column specification)
variant_df <- read_tsv(variant_file, col_types = cols())

# Inspect column names
colnames(variant_df)

# Harmonise cohort labels the same way as in STEP 1 ("Natural" -> "NAT") so
# that the per-cohort summary and Fisher's test see one row per cohort.
variant_df$cohort[variant_df$cohort %in% c("NAT", "Natural")] <- "NAT"

# The columns used below are: chr, pos and cohort (ART/NAT)
# Adjust these names if needed
# Each variant becomes a single-base range at its position.
variant_gr <- GRanges(
  seqnames = variant_df$chr,
  ranges = IRanges(start = variant_df$pos, end = variant_df$pos),
  cohort = variant_df$cohort
)

# Read the CpG island intervals from the BED file
CpG_gr <- import(CpG_file)

# Pair every variant (query) with the CpG island(s) (subject) it falls in
overlaps <- findOverlaps(variant_gr, CpG_gr)

# A variant is inside a CpG island when its row index occurs among the
# query hits of the overlap result
variant_df$in_CpG <- seq_len(nrow(variant_df)) %in% queryHits(overlaps)

# Cohort x (inside / outside) contingency table of variant counts
summary_table <- variant_df %>%
  mutate(in_CpG = if_else(in_CpG, "Inside_CpG", "Outside_CpG")) %>%
  group_by(cohort, in_CpG) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = "in_CpG", values_from = "n", values_fill = 0)

print(summary_table)

# Fisher's exact test needs a plain count matrix: cohorts become the row
# names, the inside/outside counts the columns
fisher_matrix <- as.matrix(column_to_rownames(summary_table, var = "cohort"))

# Test whether CpG-island membership is associated with cohort
fisher_result <- fisher.test(fisher_matrix)

# Show the input matrix followed by the test result
print(fisher_matrix)
print(fisher_result)


# Per-cohort counts of variants inside / outside CpG islands, one row per
# cohort. `in_CpG` is logical here, so the pivoted columns are named "TRUE"
# and "FALSE" before being renamed.
result_table <- variant_df %>%
  group_by(cohort, in_CpG) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(
    names_from = in_CpG,
    values_from = n,
    values_fill = 0
  )

# Rename only if those columns exist
result_table <- result_table %>%
  rename_with(~ "Inside_CpG", .cols = any_of("TRUE")) %>%
  rename_with(~ "Outside_CpG", .cols = any_of("FALSE"))

# If no variant falls in one of the two categories, the corresponding column
# is never created by pivot_wider(); add it as zero so the plot below (which
# maps Inside_CpG to y) still works.
if (!"Inside_CpG" %in% names(result_table)) {
  result_table$Inside_CpG <- 0L
}
if (!"Outside_CpG" %in% names(result_table)) {
  result_table$Outside_CpG <- 0L
}

print(result_table)

# Optional: Bar plot of the number of variants inside CpG islands per cohort
CpG <- ggplot(result_table, aes(x = cohort, y = Inside_CpG, fill = cohort)) +
  geom_col(color = "black", size = 2.5) +  # bar height = the count itself
  theme_classic(base_size = 46) +  # Larger base size for text elements
  scale_fill_manual(values = c("#E69F00", "#666699")) +
  ylab("dnSNV overlapping with CpG Islands") +
  xlab("Cohort") +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "none",
    axis.text.x = element_text(size = 40),
    axis.text.y = element_text(size = 40))

# Display plot (it was previously built but never drawn)
print(CpG)

