library(VariantAnnotation)
library(dplyr)
library(tidyr)
library(ggplot2)

# Load the multi-sample VCF
vcf <- readVcf(
  "MpileupDeepvariant_RepeatRemoved_35bpRemoved_BlaclListGenes_Homopolymer_ART_NAT_FINAL.vcf",
  genome = "mm39"
)

# Extract per-call FORMAT fields (variants in rows, samples in columns)
ad <- geno(vcf)$AD    # Allelic depth
dp <- geno(vcf)$DP    # Total depth
gt <- geno(vcf)$GT    # Genotype

# Sample names and variant coordinates. The coordinates are identical for
# every sample, so they are extracted once here instead of once per sample.
samples <- colnames(ad)
variants <- rowRanges(vcf)
variant_chrom <- as.character(seqnames(variants))
variant_pos <- start(variants)
variant_ids <- paste(variant_chrom, variant_pos, sep = ":")

# Pull the `index`-th AD entry from every call of one sample.
# Calls whose AD holds fewer than two values yield NA. vapply() guarantees a
# numeric vector even when the VCF contains no variants.
extract_ad_entry <- function(ad_column, index) {
  vapply(
    ad_column,
    function(x) if (length(x) >= 2) as.numeric(x[index]) else NA_real_,
    numeric(1)
  )
}

# Build the long-format table (one row per variant) for a single sample.
build_sample_table <- function(s) {
  # AD[1] is the REF depth and AD[2] the depth of the first ALT allele.
  # NOTE(review): at multi-allelic sites reads supporting further ALT alleles
  # are ignored in both numerator and denominator -- confirm the VCF has been
  # split to biallelic records if that matters.
  ref_reads <- extract_ad_entry(ad[, s], 1L)
  alt_reads <- extract_ad_entry(ad[, s], 2L)

  # ALT allele proportion; calls with zero REF+ALT reads give 0/0 = NaN,
  # which is recorded as NA.
  alt_prop <- alt_reads / (ref_reads + alt_reads)
  alt_prop[!is.finite(alt_prop)] <- NA_real_

  data.frame(
    variant_id = variant_ids,
    sample = rep(s, length(variant_ids)),
    seqnames = variant_chrom,
    pos = variant_pos,
    genotype = gt[, s],
    DP = dp[, s],
    REF_reads = ref_reads,
    ALT_reads = alt_reads,
    ALT_proportion = alt_prop,
    stringsAsFactors = FALSE
  )
}

# One table per sample, keyed by sample name
result_list <- lapply(samples, build_sample_table)
names(result_list) <- samples

# Combine into one table (optional)
combined_df <- bind_rows(result_list)

# View result
head(combined_df)

# Keep only calls that carry an ALT allele: drop homozygous-reference and
# missing genotypes, in unphased ("/"), phased ("|") and haploid notation.
excluded_genotypes <- c("0/0", "0|0", "0", "./.", ".|.", ".")
keep_call <- !is.na(combined_df$genotype) &
  !(combined_df$genotype %in% excluded_genotypes)
combined_df <- combined_df[keep_call, , drop = FALSE]

# Samples whose name ends in "-87" or "-88" form the ART cohort; every other
# sample is labelled Natural.
result_df <- combined_df %>%
  mutate(cohort = ifelse(grepl("\\-87$|\\-88$", sample), "ART", "Natural"))

head(result_df)

####################
# DENSITY PLOT
####################
# Overlaid density curves of the per-call ALT allele proportion, one curve per
# cohort. Fill colours are keyed by cohort label so they cannot swap if one
# cohort is absent or the level order changes.
d <- ggplot(result_df, aes(x = ALT_proportion, fill = cohort)) +
  geom_density(alpha = 0.5) +  # Transparent density curves
  theme_classic(base_size = 36) +
  scale_fill_manual(values = c(ART = "#E69F00", Natural = "#666699")) +
  labs(
    x = "ALT Allele Proportion",
    y = "Density"
  ) +
  theme(
    legend.title = element_blank()
  )
d

##########################
# BOXPLOT
###########################
library(ggplot2)
library(ggpubr)
library(cowplot)

head(result_df)

# Two-sample Wilcoxon rank-sum test of ALT allele proportion between cohorts
res <- wilcox.test(ALT_proportion ~ cohort, data = result_df)
print(res)

# Label for the plot annotation. Rounding to three decimals would print
# "P-value = 0" for very small p-values, so those are shown in scientific
# notation instead (format.pval() reports values below machine precision
# as "<eps").
p_value <- res$p.value
if (isTRUE(p_value < 0.001)) {
  p_fmt <- format.pval(p_value, digits = 3)
  p_label <- if (startsWith(p_fmt, "<")) {
    paste("Wilcoxon P-value", sub("^<", "< ", p_fmt))
  } else {
    paste("Wilcoxon P-value =", p_fmt)
  }
} else {
  p_label <- paste("Wilcoxon P-value =", round(p_value, 3))
}

# Boxplot of ALT allele proportion per cohort (outlier points hidden), with
# the test result annotated in the top-right corner of the panel.
# Fill colours are keyed by cohort label so they cannot swap between groups.
# NOTE(review): `b` is only assigned here, never printed or saved in this
# part of the script -- confirm it is rendered elsewhere.
b <- ggplot(result_df, aes(y = ALT_proportion, x = cohort, fill = cohort)) +
  geom_boxplot(outlier.shape = NA, color = "black", size = 1.2) +
  #geom_point(position = position_jitterdodge(), size = 5) +  # Jittered points
  theme_classic(base_size = 36) +  # Classic theme with larger base size
  scale_fill_manual(values = c(ART = "#E69F00", Natural = "#666699")) +
  labs(y = "ALT allele proportion", x = "") +  # Labels for the axes
  annotate(
    "text",
    x = Inf, y = Inf,  # Inf anchors the label to the panel's top-right corner
    label = p_label,
    hjust = 1.1, vjust = 1.1, size = 10
  ) +
  theme(
    legend.title = element_blank(),         # Remove legend title
    legend.position = "none",               # Remove legend
    axis.text = element_text(size = 28),    # Larger axis text
    axis.title = element_text(size = 28)
  )


