library(dplyr)
library(tidyverse)
library(parallel)
options(scipen=999)

# Load the per-site table and the matching per-category count table.
redit_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.txt"
)
redit_stat_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.stat.txt"
)

# Share of the total count `n` falling in each `incl_non-incl` category.
redit_stat_df %>%
  group_by(`incl_non-incl`) %>%
  dplyr::summarise(sum_n = sum(n)) %>%
  mutate(
    tot = sum(redit_stat_df[["n"]]),
    incl_rate = sum_n / tot
  )

# Output recorded from a previous run:
#   `incl_non-incl`  sum_n    tot incl_rate
#   <chr>            <dbl>  <dbl>     <dbl>
# 1 incl            380211 417251    0.911
# 2 non-incl         37040 417251    0.0888


# Within the "incl" category only: share of the count per `spliced_nascent`.
redit_stat_df %>%
  filter(`incl_non-incl` == "incl") %>%
  group_by(spliced_nascent) %>%
  dplyr::summarise(sum_n = sum(n)) %>%
  mutate(
    tot = sum(filter(redit_stat_df, `incl_non-incl` == "incl")[["n"]]),
    spl_rate = sum_n / tot
  )

# Output recorded from a previous run (among incl):
#   spliced_nascent  sum_n    tot spl_rate
#   <chr>            <dbl>  <dbl>    <dbl>
# 1 nascent          21892 380211   0.0576
# 2 spliced         358319 380211   0.942

# Attach the category columns to every site; the count column `n` is dropped
# first and the join keys are whatever columns the two tables share.
redit_merge_df <- left_join(redit_df, select(redit_stat_df, -n))

# Keep sites that are "incl" and "spliced" and have a known `start_pos`, then
# add POS_C = start - start_pos + 1.
redit_merge_incl_df <- redit_merge_df %>%
  filter(
    `incl_non-incl` == "incl",
    spliced_nascent == "spliced",
    # exclude sites with no canonical transcript in the COSMIC db, or splicing
    # variants located outside of an exon
    !is.na(start_pos)
  ) %>%
  mutate(POS_C = start - start_pos + 1)

####

calculate_stem_strength <- function(stem1, stem2) {
  # Score a stem from the base composition of `stem1` alone: every G or C
  # contributes 3, every A or T contributes 1. `stem2` is accepted for
  # interface symmetry but is not inspected.
  bases <- paste0(stem1)
  per_string_score <- 3 * str_count(bases, "[GC]") + str_count(bases, "[AT]")

  # Collapse to a single number even when several strings are supplied.
  sum(per_string_score)
}

# Scan one sequence for perfect-match hairpins whose loop contains a given
# cytosine.
#
# Every combination of stem length (1-10), loop size (3-11) and start position
# at which the whole hairpin fits inside the sequence is tried. A candidate is
# accepted when the 3' stem equals the reverse complement (A<->T, C<->G) of the
# 5' stem and `cytosine_position` lies inside the loop.
#
# Args:
#   sequence:          a single character string using the A/T/C/G alphabet.
#   cytosine_position: 1-based index of the cytosine within `sequence`.
#   add_info:          one-row data frame of annotation columns; it is
#                      prepended to every output row.
#
# Returns:
#   When at least one hairpin is found: a list of one-row data frames, one per
#   hairpin. When none is found: a single one-row data frame (not a list)
#   holding "n.a." / 0 placeholders. Callers must cope with both shapes.
identify_hairpins_single <- function(sequence, cytosine_position, add_info) {
  sequence_nt <- nchar(sequence)

  # Build one output row: annotation columns followed by hairpin columns.
  make_row <- function(stem1, loop, stem2, stem_len, strength, loop_len,
                       pos_in_loop, flanks) {
    add_info %>% cbind(tibble(
      `RNA sequence` = sequence,
      `Cytosine position` = cytosine_position,
      Stem1 = stem1,
      Loop = loop,
      Stem2 = stem2,
      `Stem length` = stem_len,
      `Stem strength` = strength,
      `Loop size` = loop_len,
      `Cytosine position in loop` = pos_in_loop,
      `Flanking nucleotides` = flanks
    ))
  }

  hits <- list()
  for (stem_length in 1:10) {
    for (loop_size in 3:11) {
      # Number of start positions at which stem + loop + stem still fits.
      # The last valid start is sequence_nt - hairpin_nt + 1, so this count
      # includes the final position as well.
      hairpin_nt <- 2 * stem_length + loop_size
      n_starts <- sequence_nt - hairpin_nt + 1
      if (n_starts < 1) {
        # Larger loops cannot fit either; move on to the next stem length
        # (a longer stem with a short loop may still fit).
        break
      }

      for (i in seq_len(n_starts)) {
        loop_start <- i + stem_length
        loop_end <- loop_start + loop_size - 1

        # Only hairpins whose loop covers the cytosine are reported, so skip
        # the string work for every other start position.
        if (cytosine_position < loop_start || cytosine_position > loop_end) {
          next
        }

        stem1 <- substr(sequence, i, loop_start - 1)
        stem2 <- substr(sequence, loop_end + 1, loop_end + stem_length)
        rev_complement <- chartr("ATCG", "TAGC", stringi::stri_reverse(stem1))
        if (stem2 != rev_complement) {
          next
        }

        loop <- substr(sequence, loop_start, loop_end)
        pos_in_loop <- cytosine_position - loop_start + 1

        # Neighbouring loop bases on either side of the cytosine; "-" when the
        # cytosine sits on the loop edge.
        left_nt <- if (pos_in_loop > 1) {
          substr(loop, pos_in_loop - 1, pos_in_loop - 1)
        } else {
          "-"
        }
        right_nt <- if (pos_in_loop + 1 <= nchar(loop)) {
          substr(loop, pos_in_loop + 1, pos_in_loop + 1)
        } else {
          "-"
        }

        hits[[length(hits) + 1]] <- make_row(
          stem1 = stem1,
          loop = loop,
          stem2 = stem2,
          stem_len = nchar(stem1),
          strength = calculate_stem_strength(stem1, stem2),
          loop_len = nchar(loop),
          pos_in_loop = pos_in_loop,
          flanks = paste0(left_nt, " / ", right_nt)
        )
      }
    }
  }

  if (length(hits) == 0) {
    # No hairpin around this cytosine: emit a single placeholder row so the
    # site still appears in the combined output.
    return(make_row(
      stem1 = "n.a.",
      loop = "n.a.",
      stem2 = "n.a.",
      stem_len = 0L,
      strength = 0L,
      loop_len = 0L,
      pos_in_loop = 0L,
      flanks = "n.a."
    ))
  }
  hits
}


# Run identify_hairpins_single() on every row of a table, in parallel.
#
# Args:
#   rna_tibble: data frame with a `twt_bp_cmpl_cont` column (the sequence) and
#               a `POS_C` column (cytosine position in that sequence). All
#               other columns are passed through as per-row annotation.
#   cores:      number of worker processes given to parallel::mclapply().
#               Defaults to 30, the value that used to be hard-coded.
#
# Returns:
#   A list with one element per input row, each being whatever
#   identify_hairpins_single() returned for that row. Nothing is flattened or
#   merged here; an empty input yields an empty list.
identify_hairpins_parallel <- function(rna_tibble, cores = 30L) {
  # seq_len() rather than 1:nrow() so that a zero-row table does no work
  # instead of processing phantom rows 1 and 0.
  row_ids <- seq_len(nrow(rna_tibble))

  results <- mclapply(row_ids, function(row) {
    df <- rna_tibble[row, ]
    identify_hairpins_single(
      df$twt_bp_cmpl_cont,
      df$POS_C,
      df %>% select(-POS_C, -twt_bp_cmpl_cont)
    )
  }, mc.cores = cores)

  # Progress markers printed to stdout; the strings are kept unchanged in case
  # downstream logs are searched for them.
  print("done")
  print("fin_calcualting")
  print("merge starting")
  results
}
library(parallel)
# Example tibble with RNA sequences and cytosine positions


# Rename the sequence column to the name identify_hairpins_parallel() reads.
# dplyr::rename is used because dplyr is already attached, whereas plyr is
# never loaded by this script.
rna_tibble <- redit_merge_incl_df %>%
  dplyr::rename(twt_bp_cmpl_cont = extracted_seq)

# To debug a single site, filter first, e.g.:
# t_df <- rna_tibble %>% filter(twt_bp_cmpl_cont == "CAGACTTTACCACCTGAAACT")

# Identify hairpins using multi-core processing
hairpin_results <- identify_hairpins_parallel(rna_tibble)
rna_tibble

# Each element of `hairpin_results` is either one data frame (no hairpin) or a
# list of data frames (one per hairpin). The inner bind_rows() collapses each
# element to a single data frame; the outer bind_rows() stacks them in one
# pass, which avoids the very slow do.call(rbind, ...) over one data frame per
# site.
final_results <- bind_rows(lapply(hairpin_results, bind_rows))
# Optional: keep only the strongest stem per sequence
#   final_results %>%
#     group_by(`RNA sequence`) %>%
#     filter(`Stem strength` == max(`Stem strength`))

write.table(
  final_results,
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.secondary_structure.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
