library(dbplyr)
library(dplyr)   # %>%, mutate(), group_by(), summarise(), distinct(), left_join()
library(ggplot2)
library(ggpubr)  # ggarrange()

# Per-sample SNP table; it is merged with the peak data further down on the
# `sample` and `cohort` columns.
SNP <- read.csv(
  file = "/projects/dumont-lab/Laura/IVF/garretson_etal_filters/SNP/Table1.csv"
)
head(SNP)

# Directory holding the per-sample ChIP-seq BED files.
path <- "/projects/dumont-lab/Laura/IVF/garretson_etal_filters/ChipSeq/persample"

# `pattern` is a regular expression, not a shell glob: "*.bed" also matches
# names that merely contain "bed" after any character (e.g. "x.bed.gz"), so
# anchor on the literal ".bed" extension instead.
lfile <- list.files(path, pattern = "\\.bed$", full.names = FALSE)

# Fail early with a clear message; otherwise `obj` would be NULL and the first
# downstream step would fail with an unrelated-looking error.
if (length(lfile) == 0) {
  stop("No .bed files found in ", path, call. = FALSE)
}

# Read one BED file from `path` and tag every row with the sample, cell stage
# and histone modification parsed from the file name.
#
# The three regular expressions below imply file names of the form
#   <sample>_<stage>.<...>.<H3...>.bed
# where <sample> is everything before the first underscore, <stage> runs from
# that underscore to the next dot, and the modification is the "H3..." token
# directly in front of ".bed". When a pattern does not match, sub() returns
# the file name unchanged.
#
# fname: file name relative to `path`.
# Returns a data frame with columns chr, start, stop, value, sample, stage,
# modif.
read_peak_file <- function(fname) {
  # skip = 1 drops the first line of the file.
  # NOTE(review): the original comment called that line a "MODEL COUNT"
  # header, which looks copied from another script -- confirm the files really
  # start with a header line, otherwise the first interval is discarded.
  df <- read.table(file.path(path, fname), sep = "\t", header = FALSE, skip = 1)
  colnames(df) <- c("chr", "start", "stop", "value")

  df$sample <- sub("_.*", "", fname)
  df$stage <- sub("^[^_]+_([^\\.]+)\\..*$", "\\1", fname)
  df$modif <- sub(".*\\.(H3[^\\.]+)\\.bed$", "\\1", fname)
  df
}

# Read every file, then bind once: calling rbind() inside a loop re-copies the
# accumulated table on each iteration.
obj <- do.call(rbind, lapply(lfile, read_peak_file))

obj

# Cohort label: samples whose name ends in "-87" or "-88" are ART, every other
# sample is Natural.
obj$cohort <- ifelse(grepl("\\-87$|\\-88$", obj$sample), "ART", "Natural")
head(obj)

# Attach the SNP table columns to every peak row of the matching sample/cohort.
merged_data <- merge(x = obj, y = SNP, by = c("sample", "cohort"))
head(merged_data)

#################################
# PER-STAGE ANALYSIS (dnSNV as total), STATS AND PLOTS
#################################
# The original script filtered `merged_data` to stage == "4cell" once and then
# drew both the "2-cell stage" and the "4-cell stage" panels from that same
# table, so the two panels were identical. Each stage is now summarised,
# tested and plotted from its own subset.

# Stage labels (as parsed from the BED file names) mapped to panel titles.
# NOTE(review): "4cell" is the label the original code filtered on; "2cell" is
# assumed by analogy -- confirm with unique(merged_data$stage).
stage_titles <- c("2cell" = "2-cell stage", "4cell" = "4-cell stage")

# Per cohort/sample/modification: count the rows with value > 0 (per the
# original comment, dnSNVs that overlap a peak) and divide by the sample's
# `dnSNV` column.
#
# stage_data: rows of `merged_data` for a single stage.
# Returns one row per (cohort, sample, modif, dnSNV) with n_variants_gt0 and
# prop_variants_gt0.
#
# An earlier, commented-out version of this step used the number of rows per
# group, n(), as the denominator instead of dnSNV.
summarise_peak_overlap <- function(stage_data) {
  stage_data %>%
    # Keep each interval once per sample and modification.
    distinct(chr, start, stop, sample, modif, .keep_all = TRUE) %>%
    group_by(cohort, sample, modif, dnSNV) %>%
    summarise(
      n_variants_gt0 = sum(value > 0),
      # `dnSNV` is a grouping column, so inside summarise() it is a vector with
      # one element per row of the group. Dividing by the whole vector made
      # summarise() return many identical rows per group (which the original
      # collapsed with a trailing distinct()); its first element gives exactly
      # one row per group and the same values.
      prop_variants_gt0 = n_variants_gt0 / dnSNV[1],
      .groups = "drop"
    )
}

# One-sided Wilcoxon rank-sum test of prop_variants_gt0 between cohorts, run
# separately for each histone modification.
#
# result: output of summarise_peak_overlap().
# Returns a data frame with columns modif, p_value, statistic.
test_cohort_difference <- function(result) {
  tests <- lapply(group_split(result, modif), function(modif_data) {
    # wilcox.test() orders the groups by factor level; for a character
    # `cohort` that is alphabetical, so "greater" tests ART > Natural.
    test_res <- wilcox.test(
      prop_variants_gt0 ~ cohort,
      alternative = "greater",
      data = modif_data
    )

    data.frame(
      # Take the name from the subset itself. The original paired
      # group_split() (ordered by key) with distinct() (order of first
      # appearance), which can attach a p-value to the wrong modification.
      modif = as.character(modif_data$modif[1]),
      p_value = test_res$p.value,
      statistic = unname(test_res$statistic),
      row.names = NULL,
      stringsAsFactors = FALSE
    )
  })

  do.call(rbind, tests)
}

# Boxplot with jittered points of the per-sample proportion by cohort, one
# facet per histone modification, with the p-value printed above each facet.
#
# result_annotated: per-sample summary joined with the per-modification tests.
# panel_title: plot title.
# Returns a ggplot object.
plot_stage <- function(result_annotated, panel_title) {
  # One label per facet, centred between the two cohorts (x = 1.5) and placed
  # 5% above the tallest point of that facet.
  p_labels <- result_annotated %>%
    group_by(modif) %>%
    summarise(
      x = 1.5,
      y = max(prop_variants_gt0, na.rm = TRUE) * 1.05,
      label = paste0("p = ", signif(unique(p_value), 3)),
      .groups = "drop"
    )

  ggplot(
    result_annotated,
    aes(y = as.numeric(prop_variants_gt0), x = cohort, fill = cohort)
  ) +
    geom_boxplot(outlier.shape = NA, color = "black", size = 1.5) +
    geom_point(position = position_jitterdodge(), size = 8) +
    theme_classic(base_size = 50) +
    scale_fill_manual(values = c("#E69F00", "#666699")) +
    labs(y = "Proportion", x = "", title = panel_title) +
    facet_wrap(. ~ modif, scales = "free") +
    geom_text(
      data = p_labels,
      aes(x = x, y = y, label = label),
      inherit.aes = FALSE,
      size = 10
    ) +
    theme(
      legend.title = element_blank(),
      legend.position = "none",
      strip.background = element_rect(fill = "grey", color = "black", size = 1.5),
      strip.text = element_text(color = "black", face = "bold"),
      plot.title = element_text(hjust = 0.5, face = "bold")
    )
}

# Run subset -> summary -> tests -> plot for one stage of `merged_data`.
#
# stage_label: value of the `stage` column to keep.
# panel_title: title of the resulting plot.
# Returns a list with elements data, result, results_df, result_annotated
# and plot.
analyse_stage <- function(stage_label, panel_title) {
  stage_data <- merged_data[merged_data$stage %in% stage_label, , drop = FALSE]
  if (nrow(stage_data) == 0) {
    stop(
      "No rows with stage == \"", stage_label, "\" in merged_data",
      call. = FALSE
    )
  }

  result <- summarise_peak_overlap(stage_data)
  results_df <- test_cohort_difference(result)
  result_annotated <- left_join(result, results_df, by = "modif")

  list(
    data = stage_data,
    result = result,
    results_df = results_df,
    result_annotated = result_annotated,
    plot = plot_stage(result_annotated, panel_title)
  )
}

stage_results <- lapply(names(stage_titles), function(stage_label) {
  analyse_stage(stage_label, stage_titles[[stage_label]])
})
names(stage_results) <- names(stage_titles)

# Show the per-sample summary and the test results of every stage.
for (stage_label in names(stage_results)) {
  message("Stage: ", stage_label)
  print(stage_results[[stage_label]]$result)
  print(stage_results[[stage_label]]$results_df)
}

# Keep the object names the original script defined; as before they hold the
# 4-cell stage.
data <- stage_results[["4cell"]]$data
result <- stage_results[["4cell"]]$result
results_df <- stage_results[["4cell"]]$results_df
result_annotated <- stage_results[["4cell"]]$result_annotated

#pdf(file = "../graph/SNP_MutationRate_accumulation_Natural_ART.pdf", height = 6, width = 4.5)
cell2 <- stage_results[["2cell"]]$plot
cell2

cell4 <- stage_results[["4cell"]]$plot
cell4


# Put the two stage panels side by side, labelled "A" and "B".
# ggarrange() with a `font.label` argument is the ggpubr function (ggplot2
# does not provide it), so ggpubr must be attached.
p1 <- ggarrange(
  cell2, cell4,
  labels = c("A", "B"),
  ncol = 2,
  nrow = 1,
  font.label = list(size = 55)
)

p1
