library(tidyverse)
library(Biostrings)

library(tidyverse)
library(GenomicRanges)
library(rtracklayer)
library(Biostrings)
library(parallel)
options(scipen=100)
## To include intron variants: make a FASTA with >ENSGXXX records, import it, and use it in the intron part. ##

# Paths to input files
gtf_file <- "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/Homo_sapiens.GRCh37.87.chr.gtf"  # Path to GTF file
genome_fasta <- "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/human_g1k_v37.ensembl.canonical.fa"  # Reference genome FASTA file
##already considerd gene_dir and complementary sequence##

#sequences <- readDNAStringSet(fasta_file)
#5       228383 ENST00000264932 ##exon
#5       77655432 ENST00000339292 ##upstream

# Sample tibble with transcript ID, chromosome, and position




# Load the FASTA file
genome_sequences <- readDNAStringSet(genome_fasta)
names(genome_sequences) <- sub(" .*", "", names(genome_sequences))  # Clean sequence names



# Load the GTF file and extract transcript coordinates
gtf_ori_data <- import(gtf_file)
gtf_data<-gtf_ori_data[gtf_ori_data$type!="gene"]

transcripts <- gtf_data[gtf_data$type == "transcript"]

#exons<-exonsBy(txdb,by="tx",use.names=TRUE)
###get intron###
library(GenomicRanges)
library(rtracklayer)
library(dplyr)

if (FALSE) {
  # Disabled prototype: derive intron ranges for every transcript from the
  # exon records already loaded in `gtf_data`. (The placeholder
  # `gtf_file <- "path/to/your.gtf"` assignment that used to sit here was
  # dropped: it was never read and would have overwritten the real path.)

  # Exon records, grouped per transcript
  exons <- gtf_data[gtf_data$type == "exon"]
  exon_groups <- split(exons, exons$transcript_id)

  # Build the introns of one transcript.
  #
  # exon_group: GRanges holding the exons of a single transcript.
  # Returns a GRanges with one range per gap between consecutive exons
  # (type set to "intron"), or NULL when there are fewer than two exons.
  get_introns <- function(exon_group) {
    if (length(exon_group) < 2) {
      return(NULL) # a single exon has no intron
    }

    # Walk the exons in ascending genomic order, whatever the strand
    exon_group <- sort(exon_group, by = ~ start)

    # Each intron spans from one exon's end + 1 to the next exon's start - 1
    intron_starts <- end(exon_group)[-length(exon_group)] + 1
    intron_ends <- start(exon_group)[-1] - 1

    introns <- GRanges(
      seqnames = seqnames(exon_group)[1],
      ranges = IRanges(start = intron_starts, end = intron_ends),
      strand = strand(exon_group)[1]
    )

    # Metadata is copied from the FIRST exon only, so any exon-specific
    # columns describe that exon rather than the intron itself.
    mcols(introns) <- mcols(exon_group)[1, , drop = FALSE]
    introns$type <- "intron"

    introns
  }

  introns_list <- mclapply(exon_groups, get_introns, mc.cores = 10)

  # Single-exon transcripts yield NULL. A NULL in first position would make
  # c() dispatch to the base method instead of the GRanges one, so drop the
  # NULLs (and the list names, which do.call would otherwise pass as
  # argument names) before combining.
  introns_list <- Filter(Negate(is.null), introns_list)
  intron_gr <- do.call(c, unname(introns_list))

  # Final GRanges object with intron annotations
  intron_gr
}




# Variant table: read the tab-separated file and drop the columns that are
# not used further down.
vcf <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.cut.vep.fin.txt"
) %>%
  select(-c(ID, QUAL, FILTER, INFO, FORMAT, Sample1))

# Small hand-written example set (transcript, chromosome, position) for
# trying get_transcript_sequence() interactively.
variant_data <- tribble(
  ~transcript_id,    ~chromosome, ~position,
  "ENST00000264932", "5",         228383,
  "ENST00000339292", "5",         77655432,
  "ENST00000423372", "1",         138046,
  "ENST00000423372", "1",         134911,
  "ENST00000326734", "1",         753768,
  "ENST00000379370", "1",         980544
)

#' Extract the transcript sequence window around a genomic variant
#'
#' Converts a genomic position into a position on the spliced transcript
#' (exons concatenated 5'->3') using the exon records of `gtf_data_gr`, then
#' returns up to `flank` bases on each side of that position from the
#' transcript's entry in `genome_sequences`.
#'
#' @param transcript_id Single transcript ID; only the `ENST[0-9]+` part is
#'   used, so a version suffix is ignored. `NA` marks an intergenic variant.
#' @param chromosome Chromosome of the variant, named as in `gtf_data_gr`.
#' @param position Genomic coordinate of the variant, in the coordinate
#'   system of `gtf_data_gr`.
#' @param gtf_data_gr GRanges of annotation records with `type` and
#'   `transcript_id` metadata columns.
#' @param genome_sequences Sequence set indexed by transcript ID. The
#'   coordinates computed here assume each sequence is the spliced transcript
#'   in 5'->3' orientation -- NOTE(review): confirm against the FASTA.
#' @param flank Number of bases to take on each side of the variant
#'   (default 20, i.e. a window of at most 41 bases).
#' @return A list with `extracted_seq`, `start_pos`, `start`, `end_pos` and
#'   `transcript_len`. On success `extracted_seq` is the window sequence,
#'   `start` the transcript position of the variant, and `start_pos` /
#'   `end_pos` the window bounds clipped to the transcript. Otherwise the
#'   numeric fields are `NA` and `extracted_seq` is one of the status strings
#'   `"intergenic_variant"`, `"absence_transcript"`,
#'   `"db_exons_non_overalap"` or `"absence_of_transcript"` (spelling kept
#'   as-is because downstream code may match on it).
get_transcript_sequence <- function(transcript_id, chromosome, position,
                                    gtf_data_gr, genome_sequences,
                                    flank = 20) {
  # Every non-sequence outcome shares this shape; only the status differs.
  status_result <- function(status) {
    list(extracted_seq = status, start_pos = NA, start = NA, end_pos = NA,
         transcript_len = NA)
  }

  # Variants without a transcript annotation are intergenic
  if (is.na(transcript_id)) {
    return(status_result("intergenic_variant"))
  }

  # Remove version number from transcript ID
  transcript_base_id <- str_extract(transcript_id, "ENST[0-9]+")

  # Use the annotation that was passed in (the previous version silently
  # replaced it with the global `gtf_data`). which() discards NA
  # comparisons, so records without a transcript_id, or an ID that does not
  # contain an ENST accession, simply match nothing.
  is_this_transcript <- gtf_data_gr$transcript_id == transcript_base_id
  transcript <- gtf_data_gr[
    which(is_this_transcript & gtf_data_gr$type == "transcript")
  ]
  if (length(transcript) == 0) {
    return(status_result("absence_transcript"))
  }

  # Put the exons in transcript (5'->3') order explicitly instead of relying
  # on the row order of the GTF: ascending on "+", descending otherwise.
  on_plus_strand <- as.character(strand(transcript)[1]) == "+"
  exons <- gtf_data_gr[which(is_this_transcript & gtf_data_gr$type == "exon")]
  exons <- exons[order(start(exons), decreasing = !on_plus_strand)]

  # Which exon, if any, contains the variant?
  # NOTE: intronic / other non-exonic variants are not handled yet; the
  # disabled prototype for them was removed because it used `exon_hit`
  # before that variable existed.
  variant_site <- GRanges(seqnames = as.character(chromosome),
                          ranges = IRanges(start = position, width = 1))
  hits <- findOverlaps(variant_site, exons)
  if (length(hits) == 0) {
    return(status_result("db_exons_non_overalap"))
  }
  exon_index <- subjectHits(hits)[1]

  # Transcript position = bases in all preceding exons + offset inside the
  # hit exon + 1. The offset is measured from the exon's 5' end, which is
  # its genomic start on "+" and its genomic end otherwise.
  bases_before_exon <- c(0, cumsum(width(exons)))[exon_index]
  offset_in_exon <- if (on_plus_strand) {
    position - start(exons)[exon_index]
  } else {
    end(exons)[exon_index] - position
  }
  transcript_pos <- bases_before_exon + offset_in_exon + 1

  # Test for the sequence by name BEFORE indexing, so a transcript missing
  # from the sequence set yields the status result rather than depending on
  # how `[[` treats an unknown name.
  if (!(transcript_base_id %in% names(genome_sequences))) {
    return(status_result("absence_of_transcript"))
  }
  transcript_seq <- genome_sequences[[transcript_base_id]]
  transcript_len <- length(transcript_seq)

  # Window of +/- flank bases, clipped to the transcript boundaries
  start_pos <- max(transcript_pos - flank, 1)
  end_pos <- min(transcript_pos + flank, transcript_len)
  extracted_seq <- as.character(
    subseq(transcript_seq, start = start_pos, end = end_pos)
  )

  list(extracted_seq = extracted_seq, start_pos = start_pos,
       start = transcript_pos, end_pos = end_pos,
       transcript_len = transcript_len)
}
#A<-get_transcript_sequence(variant_data$transcript_id,variant_data$chromosome,variant_data$position,gtf_data,genome_sequences)

# Annotate every row of `vcf` with its transcript sequence window, in
# parallel, then merge the per-row results and write them out.
print("starting")
t_res <- mclapply(seq_len(nrow(vcf)), function(x) {
  print(x) # progress: index of the row being processed
  t_df <- vcf[x, ]

  # Call the annotator exactly once and keep its result. A failing row is
  # reported and turned into an "error" status row, so a single bad record
  # can no longer leak an error object into the merge below.
  results <- tryCatch(
    get_transcript_sequence(t_df$vep_transcript_id, t_df$`#CHROM`, t_df$POS,
                            gtf_data, genome_sequences),
    error = function(e) {
      message("row ", x, " failed: ", conditionMessage(e))
      list(extracted_seq = "error", start_pos = NA, start = NA, end_pos = NA,
           transcript_len = NA)
    }
  )

  cbind(t_df, results) %>% as_tibble()
}, mc.cores = 20)

print("starting merge")
# bind_rows() stacks the whole list in one pass
annot_results <- bind_rows(t_res)
annot_results %>%
  select(extracted_seq, `#CHROM`, POS, start_pos, start, end_pos, gene_dir, sig_cont)

# Write positions in full rather than in scientific notation
options(scipen = 999)
annot_results %>%
  write.table("/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.txt",
              sep = "\t",
              quote = FALSE,
              row.names = FALSE)


if (FALSE) {
  # Disabled scratch code for inspecting results and trying the helpers on a
  # small example table.

  # Quick looks at the annotated table. The window end is stored as
  # `end_pos`; there is no `end` column.
  annot_results %>%
    select(`#CHROM`, POS, VEP_region, vep_transcript_id, extracted_seq,
           start_pos, start, end_pos, transcript_len)

  annot_results %>% filter(is.na(start_pos))

  # Run get_transcript_sequence() over the example table and unpack the
  # returned lists into character columns.
  variant_data_results <- variant_data %>%
    mutate(result_list = pmap(list(transcript_id, chromosome, position),
                              ~ get_transcript_sequence(..1, ..2, ..3, gtf_data, genome_sequences))) %>%
    mutate(
      extracted_seq = map_chr(result_list, ~ .x$extracted_seq),
      start_pos = map_chr(result_list, ~ as.character(.x$start_pos)),
      start = map_chr(result_list, ~ as.character(.x$start)),
      # read the field by its real name instead of relying on `$end`
      # partially matching `end_pos`
      end_pos = map_chr(result_list, ~ as.character(.x$end_pos)),
      all_info = str_c(extracted_seq, start_pos, start, end_pos, sep = "\t")
    ) %>%
    select(-result_list)

  # Print results (the computed table, not the input)
  print(variant_data_results)

  #######
  # Load the FASTA file
  fasta_file <- "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/human_g1k_v37.ensembl.canonical.fa"  # Change this to your FASTA file path
  sequences <- readDNAStringSet(fasta_file)
  # Example records (chromosome, position, transcript):
  #   5   228383    ENST00000264932   (exon)
  #   5   77655432  ENST00000339292   (upstream)

  # Sample tibble with transcript ID, chromosome, and position
  variant_data <- tibble(
    transcript_id = c("ENST00000264932", "ENST00000339292"),
    chromosome = c("5", "5"),  # Not used here but can be included if needed
    position = c(228383, 77655432)  # Example positions
  )

  # Extract the -20bp to +20bp window around `position` from a transcript.
  #
  # transcript_id: transcript ID; only the ENST[0-9]+ part is used.
  # position:      position within the transcript sequence.
  # seq_data:      sequence set indexed by transcript ID.
  # Returns the window as a string, or NA_character_ when the transcript is
  # not among names(seq_data).
  extract_sequence <- function(transcript_id, position, seq_data) {
    # Remove version number from ENST ID if necessary
    transcript_base_id <- str_extract(transcript_id, "ENST[0-9]+")

    # Typed NA so the map2_chr() call below always receives a character
    if (!(transcript_base_id %in% names(seq_data))) {
      return(NA_character_)
    }

    # Get full transcript sequence
    transcript_seq <- seq_data[[transcript_base_id]]

    # Clip the window to the sequence boundaries
    start_pos <- max(position - 20, 1)
    end_pos <- min(position + 20, length(transcript_seq))

    as.character(subseq(transcript_seq, start = start_pos, end = end_pos))
  }

  # Apply function to each row in tibble
  variant_data <- variant_data %>%
    mutate(sequence = map2_chr(transcript_id, position, ~ extract_sequence(.x, .y, sequences)))

  # Print results
  print(variant_data)
}
