#!/usr/bin/env Rscript

# Script to identify which variants from a VCF file fall within specific genomic ranges
# and to count, per range, the cytosines that are not in a CG context
# Usage: Rscript vcf_range_finder.R --vcf input.vcf --bed regions.txt --dir F1R2
# Output: <VCF base name>.<dir>NCH_rate.txt, written to the working directory

# Load required libraries
suppressPackageStartupMessages({
  library(optparse)
  library(VariantAnnotation)
  library(GenomicRanges)
  library(Biostrings)
  library(BSgenome)
  library(rtracklayer)
  library(dplyr)
  library(readr)
})

# Parse command line arguments
# The --genome and --output options are currently disabled: the reference
# genome path is hard-coded further down, and the output file name is derived
# from the VCF file name and --dir.
option_list <- list(
  make_option("--vcf", type = "character", help = "Path to the VCF file"),
  make_option(
    "--bed",
    type = "character",
    help = paste(
      "Path to the tab-separated ranges file with header columns",
      "CHR1, POS1, POS2, POS3_2, POS4_2, dir, info, type"
    )
  ),
  make_option("--dir", type = "character", help = "F1R2 or F2R1")
)

opt_parser <- OptionParser(option_list = option_list)
args <- parse_args(opt_parser)

# Fail early, with the usage text, instead of crashing later on a NULL path
# or silently filtering the ranges down to nothing.
required_opts <- c("vcf", "bed", "dir")
is_missing <- vapply(
  required_opts,
  function(opt) is.null(args[[opt]]),
  logical(1)
)
if (any(is_missing)) {
  print_help(opt_parser)
  stop(
    "Missing required option(s): ",
    paste0("--", required_opts[is_missing], collapse = ", "),
    call. = FALSE
  )
}

# The counting code treats every value other than "F1R2" as the reverse
# orientation, so reject typos explicitly.
if (!(args$dir %in% c("F1R2", "F2R1"))) {
  stop("--dir must be either F1R2 or F2R1, got: ", args$dir, call. = FALSE)
}

read_dir <- args$dir
# Sample id = VCF file name up to (not including) its first dot.
id <- gsub("\\..*", "", basename(args$vcf))


# Read VCF file
cat(sprintf("Reading VCF file: %s\n", args$vcf))
vcf <- readVcf(args$vcf)

# Load reference genome
# NOTE(review): the FASTA path is hard-coded (the --genome option is
# disabled), so the script only runs where this file exists.
reference_genome <- readDNAStringSet("/home/users/data/01_reference/human_g1k_v37/human_g1k_v37.fasta")
# Keep only the first whitespace-delimited token of each FASTA header so the
# sequence names can be matched against chromosome names.
names(reference_genome) <- gsub(" .*", "", names(reference_genome))

# Extract variant information from VCF
variants <- rowRanges(vcf)
variant_df <- data.frame(
  CHROM = seqnames(variants),
  POS = start(variants),
  REF = as.character(ref(vcf)),
  # One string per record: multi-allelic sites are collapsed to "A,T".
  # unlist(alt(vcf)) would instead yield one element per allele and break the
  # one-row-per-record layout as soon as a record has more than one ALT.
  ALT = unname(unstrsplit(CharacterList(alt(vcf)), sep = ",")),
  stringsAsFactors = FALSE
)

# Create GRanges object for variants (one single-base range per record).
# Passing the data frame as `mcols = ` produces metadata columns prefixed with
# "mcols." (mcols.REF, mcols.ALT, mcols.VariantID); later code relies on
# exactly those names.
variant_granges <- GRanges(
  seqnames = variant_df$CHROM,
  ranges = IRanges(start = variant_df$POS, end = variant_df$POS),
  mcols = data.frame(
    REF = variant_df$REF,
    ALT = variant_df$ALT,
    # seq_len() stays empty for an empty VCF, whereas 1:0 would give c(1, 0).
    VariantID = seq_len(nrow(variant_df))
  )
)

# Read BED file with ranges
# The file is read as a tab-separated table with a header row; only the rows
# whose `dir` column equals the requested read direction are kept.
cat(sprintf("Reading BED file: %s\n", args$bed))
ranges_df <- read_tsv(args$bed)

# Check the columns this script reads before using them. In particular,
# without a `dir` column the filter below would pick up base::dir instead.
required_cols <- c("CHR1", "POS1", "POS2", "POS3_2", "POS4_2", "dir")
missing_cols <- setdiff(required_cols, names(ranges_df))
if (length(missing_cols) > 0) {
  stop(
    "Ranges file is missing required column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

ranges_df <- ranges_df %>%
  dplyr::filter(dir == read_dir)

# Nothing to analyse: stop with a clear message instead of failing later on
# an empty table.
if (nrow(ranges_df) == 0) {
  stop(
    sprintf("No rows with dir == '%s' in %s", read_dir, args$bed),
    call. = FALSE
  )
}

# RangeID is the row index within the filtered table; it links overlaps back
# to their originating row.
ranges_df$RangeID <- seq_len(nrow(ranges_df))

# read_tsv() parses purely numeric chromosome names as numbers; sequence
# names must be character.
range_chr <- as.character(ranges_df$CHR1)

# Create two sets of GRanges objects for the two ranges: POS1~POS2 and POS3~POS4
# As for the variants, `mcols = ` yields columns named mcols.RangeType and
# mcols.RangeID.
# NOTE(review): range 1 starts at POS1 - 1 while range 2 starts at POS3_2
# unchanged -- confirm that this asymmetry is intended.
range1_granges <- GRanges(
  seqnames = range_chr,
  ranges = IRanges(start = ranges_df$POS1 - 1, end = ranges_df$POS2),
  mcols = data.frame(
    RangeType = rep("Range1", nrow(ranges_df)),
    RangeID = ranges_df$RangeID
  )
)

range2_granges <- GRanges(
  seqnames = range_chr,
  ranges = IRanges(start = ranges_df$POS3_2, end = ranges_df$POS4_2),
  mcols = data.frame(
    RangeType = rep("Range2", nrow(ranges_df)),
    RangeID = ranges_df$RangeID
  )
)

# Pool the first and second range of every row into one GRanges object;
# mcols.RangeType records which of the two a range came from.
all_ranges <- c(range1_granges, range2_granges)

# Match every variant position against the pooled ranges.
cat("Finding variants in ranges...\n")
overlaps <- findOverlaps(variant_granges, all_ranges)

# Row indices of each hit: into the variants (query) and into the pooled
# ranges (subject).
query_indices <- queryHits(overlaps)
subject_indices <- subjectHits(overlaps)

# Metadata tables of both sides; their columns carry the "mcols." prefix.
variant_meta <- mcols(variant_granges)
range_meta <- mcols(all_ranges)

# One row per (variant, range) overlap.
results <- data.frame(
  VariantID = variant_meta$mcols.VariantID[query_indices],
  CHROM = seqnames(variant_granges)[query_indices],
  POS = start(variant_granges)[query_indices],
  REF = variant_meta$mcols.REF[query_indices],
  ALT = variant_meta$mcols.ALT[query_indices],
  RangeType = range_meta$mcols.RangeType[subject_indices],
  RangeID = range_meta$mcols.RangeID[subject_indices],
  stringsAsFactors = FALSE
)

# Attach the coordinates of the originating row; RangeID is that row's index
# in ranges_df.
results$POS1 <- ranges_df$POS1[results$RangeID]
results$POS2 <- ranges_df$POS2[results$RangeID]
results$POS3 <- ranges_df$POS3_2[results$RangeID]
results$POS4 <- ranges_df$POS4_2[results$RangeID]
  
#results
# Count the cytosines in a genomic range that are not part of a CG
# dinucleotide.
#
# For read_dir == "F1R2" the C bases in start1..end1 are counted and every C
# immediately followed by a G is subtracted. For any other read_dir the
# mirror image is computed: the G bases are counted and every G immediately
# preceded by a C is subtracted.
#
# Arguments:
#   chr       sequence name; coerced to character so that a numeric value
#             (e.g. a chromosome column parsed as numbers) is looked up by
#             name and never by position.
#   start1    first position of the range (1-based, inclusive).
#   end1      last position of the range (1-based, inclusive).
#   read_dir  "F1R2", or anything else for the G-based count.
#   genome    named collection of sequences, indexable with [[name]];
#             defaults to the script-level `reference_genome`.
#
# Returns a single count, or NA (with a warning) when `chr` is not a sequence
# name in `genome`.
count_non_CG_cytosines <- function(chr, start1, end1, read_dir,
                                   genome = reference_genome) {
  chr <- as.character(chr)

  # Check if chromosome exists in reference genome
  if (!(chr %in% names(genome))) {
    warning(sprintf("Chromosome %s not found in reference genome", chr))
    return(NA_integer_)
  }

  chrom_seq <- genome[[chr]]
  chrom_len <- nchar(chrom_seq)

  # Number of literal, case-sensitive occurrences of `pattern` in `text`.
  count_fixed <- function(text, pattern) {
    hits <- gregexpr(pattern, text, fixed = TRUE)[[1]]
    sum(hits > 0)
  }

  core_seq <- as.character(subseq(chrom_seq, start1, end1))

  # The CG test needs one neighbouring base beyond the range. Clamp that
  # extension to the chromosome so ranges touching either end do not fail;
  # a base without a neighbour cannot be in a CG context anyway.
  if (read_dir == "F1R2") {
    counted_base <- "C"
    flank_start <- start1
    flank_end <- min(end1 + 1, chrom_len)
  } else {
    counted_base <- "G"
    flank_start <- max(start1 - 1, 1)
    flank_end <- end1
  }
  flanked_seq <- as.character(subseq(chrom_seq, flank_start, flank_end))

  count_fixed(core_seq, counted_base) - count_fixed(flanked_seq, "CG")
}

# Count non-CG cytosines for both ranges of every row.
cat("Counting non-CG cytosines in each range...\n")

# Apply count_non_CG_cytosines() row by row to the given start/end columns.
# It is called with scalars because it handles one range at a time; the
# result is one value per row of ranges_df (NA when the chromosome is not in
# the reference).
count_for_rows <- function(starts, ends) {
  vapply(
    seq_len(nrow(ranges_df)),
    function(i) {
      as.numeric(count_non_CG_cytosines(
        ranges_df$CHR1[i], starts[i], ends[i], ranges_df$dir[i]
      ))
    },
    numeric(1)
  )
}

ranges_df$NCH_range1 <- count_for_rows(ranges_df$POS1, ranges_df$POS2)
ranges_df$NCH_range2 <- count_for_rows(ranges_df$POS3_2, ranges_df$POS4_2)
ranges_df$NCH_tot <- ranges_df$NCH_range1 + ranges_df$NCH_range2

# Sort by chromosome and position
results <- results %>% arrange(CHROM, POS, RangeID)

# Number of overlapping variants per row of the ranges table.
mut_counts <- results %>%
  group_by(RangeID) %>%
  dplyr::summarise(mut_count = n())

# One output row per range pair: coordinates, cytosine counts, variant count
# and their ratio. The join key is stated explicitly so no other shared
# column name can ever take part in the join.
fin_df <- ranges_df %>%
  dplyr::filter(dir == read_dir) %>%
  left_join(mut_counts, by = "RangeID") %>%
  dplyr::select(
    CHR1, POS1, POS2, POS3_2, POS4_2, dir, RangeID,
    NCH_range1, NCH_range2, NCH_tot, mut_count
  ) %>%
  mutate(NCH_rate = mut_count / NCH_tot)

# Rows without any overlapping variant have NA mut_count and NCH_rate after
# the join; report them as 0.
# NOTE(review): this also turns NA cytosine counts (chromosome missing from
# the reference) and NaN rates (0 / 0) into 0 -- confirm that is wanted.
fin_df[is.na(fin_df)] <- 0

# NOTE(review): there is no "." between the read direction and "NCH_rate" in
# the file name (e.g. "<id>.F1R2NCH_rate.txt"); left unchanged because
# downstream steps may depend on the current name.
fin_df %>%
  write.table(paste0(id, ".", read_dir, "NCH_rate.txt"),
              sep = "\t",
              quote = FALSE,
              row.names = FALSE)
  

