library(dplyr)
library(tidyverse)
# Print large numbers in full instead of scientific notation.
options(scipen = 999)

# Collect the depth summary files, skipping every path that contains "star2".
depth_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WTS/13_RNA_editing_bam_process/depth/v2",
  "sum.txt",
  full.names = TRUE
)
depth_files <- depth_files[!grepl("star2", depth_files)]

# Read each file as a single `depth` column and tag every row with the sample
# id and the read type, both derived from the file name by regex substitution.
depth_tmp <- lapply(depth_files, function(x) {
  file_name <- basename(x)
  read_tsv(x, col_names = c("depth"), show_col_types = FALSE) %>%
    mutate(
      id = gsub(".F.*R.*", "", file_name),
      read_type = gsub(".depth.sum.txt", "", gsub("^.*[-_]..", "", file_name))
    )
})

# Stack the per-file tables into one table.
depth_df <- do.call(rbind, depth_tmp)

# Add the per-id total depth to every row, then rename `id` to `new_id`.
depth_sum_df <- depth_df %>%
  left_join(
    depth_df %>%
      group_by(id) %>%
      dplyr::summarise(tot_depth = sum(depth))
  ) %>%
  plyr::rename(c("id" = "new_id"))


# Sample metadata; `new_id` is the basename of the F1R2 column with the part
# matched by the regex ".F.*" removed.
metadata <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WTS/13_RNA_editing_bam_process/dna/depth/metadata.v2.txt"
) %>%
  mutate(new_id = gsub(".F.*", "", basename(F1R2)))

# Join depth rows to metadata on their shared columns.
depth_merge_df <- left_join(depth_sum_df, metadata)

# Keep A3A/A3B samples from batches bat1/bat3 at 0h/48h and express the total
# depth as a fraction of a fixed constant.
# NOTE(review): 3119214835 is presumably a reference genome length - confirm.
depth_merge_fil_df <- depth_merge_df %>%
  filter(
    APOBEC %in% c("A3A", "A3B"),
    batch %in% c("bat1", "bat3"),
    time %in% c("0h", "48h")
  ) %>%
  mutate(depth_ratio = tot_depth / 3119214835)

# One row per sample; the `id` and `new_id` column names are swapped so the
# table joins onto the RNA editing table's naming.
depth_fin_df <- depth_merge_fil_df %>%
  select(id, new_id, depth_ratio) %>%
  distinct() %>%
  plyr::rename(c("id" = "new_id", "new_id" = "id"))

# Export the per-sample total depth.
depth_merge_fil_df %>%
  select(new_id, tot_depth) %>%
  distinct() %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig2/redit_depth.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# Load the RNA editing calls. `vaf` is the variant read fraction taken from the
# orientation named in `align_dir` (F1R2 counts when align_dir is "F1R2",
# F2R1 counts otherwise). The `sig_cont` column is renamed to `DNA_sig_cont`.
rna_editing_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WTS/14_RNA_editing_integrate/star/varscan/final/reheader/APOBEC_rna_editing.v8.tsv"
) %>%
  mutate(
    vaf = ifelse(
      align_dir == "F1R2",
      F1R2_var_readc / F1R2_tot_readc,
      F2R1_var_readc / F2R1_tot_readc
    )
  ) %>%
  plyr::rename(c("sig_cont" = "DNA_sig_cont"))

# Attach the per-sample depth ratio (joined on the shared columns).
rna_editing_df <- left_join(rna_editing_df, depth_fin_df)

# Per-sample raw counts of A>G and C>T calls, one column per mutation type.
# The context column is called `DNA_sig_cont` after the rename above, so the
# summary has to use that name; `sig_cont` no longer exists in this table.
rna_editing_df %>%
  filter(grepl("A>G", DNA_sig_cont) | grepl("C>T", DNA_sig_cont)) %>%
  mutate(mut_type = ifelse(grepl("A>G", DNA_sig_cont), "A>G", "C>T")) %>%
  group_by(new_id, mut_type) %>%
  dplyr::summarise(raw_sum_count = n()) %>%
  spread(mut_type, raw_sum_count) %>%
  print(n = 100)


# Restrict to samples present in the depth table, divide every read count by
# the sample's depth ratio, keep the columns used downstream, and drop rows
# whose gene direction is ".".
rna_editing_filter_df <- rna_editing_df %>%
  filter(new_id %in% depth_fin_df$new_id) %>%
  mutate(
    cor_var_readc = var_readc / depth_ratio,
    cor_depth = depth / depth_ratio,
    cor_F1R2_var_readc = F1R2_var_readc / depth_ratio,
    cor_F2R1_var_readc = F2R1_var_readc / depth_ratio,
    cor_F1R2_tot_readc = F1R2_tot_readc / depth_ratio,
    cor_F2R1_tot_readc = F2R1_tot_readc / depth_ratio
  ) %>%
  select(
    id, new_id, depth_ratio, CHROM, POS, REF, ALT, gene_dir, align_dir,
    cor_var_readc, cor_depth,
    cor_F1R2_var_readc, cor_F2R1_var_readc,
    cor_F1R2_tot_readc, cor_F2R1_tot_readc,
    DNA_sig_cont, rescue, Func_refGene, Gene_refGene, ExonicFunc_refGene
  ) %>%
  filter(gene_dir != ".")

# Samples with at least one call at corrected depth >= 4 and corrected variant
# read count >= 3.
rna_id_c <- rna_editing_filter_df %>%
  filter(cor_depth >= 4, cor_var_readc >= 3) %>%
  pull(new_id) %>%
  unique()

# All calls from those samples, keyed by a site string and labelled by enzyme
# (A3A when the sample name contains "A3A", A3B otherwise).
rna_editing_filter_sample_df <- rna_editing_filter_df %>%
  filter(new_id %in% rna_id_c) %>%
  mutate(
    info = paste(CHROM, POS, REF, ALT, align_dir, sep = "_"),
    APOBEC = ifelse(grepl("A3A", new_id), "A3A", "A3B")
  )

# Number of rows per enzyme and site.
rna_editing_filter_sample_stat_df <- rna_editing_filter_sample_df %>%
  group_by(APOBEC, info) %>%
  dplyr::summarise(n = n())

# Samples carrying at least one site that occurs more than once.
rna_editing_filter_sample_df %>%
  filter(info %in% pull(filter(rna_editing_filter_sample_stat_df, n > 1), info)) %>%
  select(new_id) %>%
  unique()


## exclude 0h
# Same construction, restricted to samples whose name does not contain "0h".
rna_editing_filter_sample_oe_df <- rna_editing_filter_df %>%
  filter(new_id %in% rna_id_c, !grepl("0h", new_id)) %>%
  mutate(
    info = paste(CHROM, POS, REF, ALT, align_dir, sep = "_"),
    APOBEC = ifelse(grepl("A3A", new_id), "A3A", "A3B")
  )

rna_editing_filter_sample_oe_df %>%
  select(new_id) %>%
  unique()

rna_editing_filter_sample_oe_stat_df <- rna_editing_filter_sample_oe_df %>%
  group_by(APOBEC, info) %>%
  dplyr::summarise(n = n())


#rna_editing_filter_sample_df%>%filter(info%in%(rna_editing_filter_sample_stat_df%>%filter(n>1))$info)%>%
#  write.table("/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/hotspot/RNA_editing.hotspot.tsv",
#              quote=F,
#              row.names=F,
#              sep="\t")

library(ggVennDiagram)
library(ggplot2, lib.loc = "~/R-lib")
library(ggVennDiagram)

# Calls whose context string contains "C>T".
rna_editing_filter_sample_CtoT_df <- rna_editing_filter_sample_df %>%
  filter(grepl("C>T", DNA_sig_cont))

## 1. Sites shared between A3A and A3B (not hotspots): compared by site key only,
##    regardless of how many samples carry the site.
# Show the distinct ids contributing to each enzyme.
rna_editing_filter_sample_CtoT_df %>%
  filter(APOBEC == "A3A") %>%
  distinct(id)
rna_editing_filter_sample_CtoT_df %>%
  filter(APOBEC == "A3B") %>%
  distinct(id)

# Unique site keys per enzyme.
A3A_site <- rna_editing_filter_sample_CtoT_df %>%
  filter(APOBEC == "A3A") %>%
  pull(info) %>%
  unique()
A3B_site <- rna_editing_filter_sample_CtoT_df %>%
  filter(APOBEC == "A3B") %>%
  pull(info) %>%
  unique()

site_sets <- list(Group1 = A3A_site, Group2 = A3B_site)

p_ven <- ggVennDiagram(
  site_sets,
  label_alpha = 0,
  category.names = c("A3A", "A3B")
) +
  scale_fill_gradient(low = "white", high = "blue") +
  theme(text = element_text(angle = 45))
p_ven

# Write the C>T table to disk and read it straight back, so the in-memory
# table from here on is the re-parsed file.
write.table(
  rna_editing_filter_sample_CtoT_df,
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/hotspot/RNA_editing.CtoT.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
rna_editing_filter_sample_CtoT_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/hotspot/RNA_editing.CtoT.tsv"
)

ggsave(
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/hotspot/RNA_editing_common_site.venn.pdf",
  p_ven,
  height = 8,
  width = 10
)

## 1-2. Same comparison restricted to sites whose context contains "TC>T".
rna_editing_filter_sample_CtoT_df %>%
  filter(APOBEC == "A3A") %>%
  distinct(id)
rna_editing_filter_sample_CtoT_df %>%
  filter(APOBEC == "A3B") %>%
  distinct(id)

# Unique site keys per enzyme within the TC>T context.
A3A_TCN_site <- rna_editing_filter_sample_CtoT_df %>%
  filter(APOBEC == "A3A") %>%
  filter(grepl("TC>T", DNA_sig_cont)) %>%
  pull(info) %>%
  unique()
A3B_TCN_site <- rna_editing_filter_sample_CtoT_df %>%
  filter(APOBEC == "A3B") %>%
  filter(grepl("TC>T", DNA_sig_cont)) %>%
  pull(info) %>%
  unique()

TCN_site_sets <- list(Group1 = A3A_TCN_site, Group2 = A3B_TCN_site)

p_TCN_ven <- ggVennDiagram(
  TCN_site_sets,
  label_alpha = 0,
  category.names = c("A3A", "A3B")
) +
  scale_fill_gradient(low = "white", high = "blue") +
  theme(text = element_text(angle = 45))
p_TCN_ven

# The C>T table is written out again to the same path.
write.table(
  rna_editing_filter_sample_CtoT_df,
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/hotspot/RNA_editing.CtoT.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
p_TCN_ven
ggsave(
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/hotspot/RNA_editing_common_site.venn.TCN.pdf",
  p_TCN_ven,
  height = 8,
  width = 10
)

## 2. Sites shared between the hotspots of A3A and A3B
##    (hotspot = site seen more than once within an enzyme).
# Reload the C>T table and label each row's condition from the sample name:
# "0h" if it contains "0h", "0.1ug" if it contains "100ng", otherwise "3ug".
rna_editing_filter_sample_CtoT_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/hotspot/RNA_editing.CtoT.tsv"
) %>%
  mutate(
    condition = ifelse(
      grepl("0h", new_id),
      "0h",
      ifelse(grepl("100ng", new_id), "0.1ug", "3ug")
    )
  )

# Row count per enzyme, condition and site.
rna_editing_filter_sample_CtoT_stat_df <- rna_editing_filter_sample_CtoT_df %>%
  group_by(APOBEC, condition, info) %>%
  dplyr::summarise(n = n())

# One column per condition; combinations that never occur count as 0.
rna_editing_filter_sample_CtoT_stat_spr_df <- rna_editing_filter_sample_CtoT_stat_df %>%
  spread(condition, n)
rna_editing_filter_sample_CtoT_stat_spr_df[is.na(rna_editing_filter_sample_CtoT_stat_spr_df)] <- 0

# Classify each site by where it recurs:
#   zero_uniq - seen once, only at 0h
#   0h_recur  - seen more than once, only at 0h
#   uniq      - seen once, only in the treated conditions
#   recur     - everything else
rna_editing_filter_sample_CtoT_stat_spr_df <- rna_editing_filter_sample_CtoT_stat_spr_df %>%
  mutate(
    type = case_when(
      `0.1ug` + `3ug` == 0 & `0h` == 1 ~ "zero_uniq",
      `0h` > 1 & `0.1ug` + `3ug` == 0 ~ "0h_recur",
      `0.1ug` + `3ug` == 1 & `0h` == 0 ~ "uniq",
      TRUE ~ "recur"
    )
  )

# Number of sites per enzyme and class.
rna_editing_filter_sample_CtoT_stat_spr_df %>%
  group_by(APOBEC, type) %>%
  dplyr::summarise(n = n())

# Rows at recurrent sites, selected separately for each enzyme and stacked
# (A3A rows first, then A3B rows).
rna_editing_CtoT_hotspot_df <- rbind(
  rna_editing_filter_sample_CtoT_df %>%
    filter(APOBEC == "A3A") %>%
    filter(
      info %in% pull(
        filter(
          rna_editing_filter_sample_CtoT_stat_spr_df,
          APOBEC == "A3A" & type %in% c("0h_recur", "recur")
        ),
        info
      )
    ),
  rna_editing_filter_sample_CtoT_df %>%
    filter(APOBEC == "A3B") %>%
    filter(
      info %in% pull(
        filter(
          rna_editing_filter_sample_CtoT_stat_spr_df,
          APOBEC == "A3B" & type %in% c("0h_recur", "recur")
        ),
        info
      )
    )
)

# Attach the site class to every row.
rna_editing_CtoT_hotspot_df <- left_join(
  rna_editing_CtoT_hotspot_df,
  rna_editing_filter_sample_CtoT_stat_spr_df %>% select(APOBEC, info, type)
)

# Only the "recur" class goes into the Venn diagram.
A3A_hotspot_site <- rna_editing_CtoT_hotspot_df %>%
  filter(APOBEC == "A3A", type == "recur") %>%
  pull(info) %>%
  unique()
A3B_hotspot_site <- rna_editing_CtoT_hotspot_df %>%
  filter(APOBEC == "A3B", type == "recur") %>%
  pull(info) %>%
  unique()

hotspot_site_sets <- list(Group1 = A3A_hotspot_site, Group2 = A3B_hotspot_site)
library(ggVennDiagram)
p_ven_hotspot <- ggVennDiagram(
  hotspot_site_sets,
  label_alpha = 0,
  category.names = c("A3A", "A3B")
) +
  scale_fill_gradient(low = "white", high = "blue") +
  theme(text = element_text(angle = 45))
p_ven_hotspot
ggsave(
  "/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/hotspot/RNA_editing_common_site.hotspot.venn.pdf",
  p_ven_hotspot,
  height = 8,
  width = 10
)

## 3. Sites shared between the hotspots of A3A and A3B, restricted to rows
##    whose context contains "TC>".


#rna_editing_CtoT_hotspot_df<-rna_editing_filter_sample_CtoT_df%>%filter(info%in%(rna_editing_filter_sample_CtoT_stat_df%>%filter(n>1))$info)

# Unique "recur" site keys per enzyme within the TC> context.
A3A_hotspot_TCN_site <- rna_editing_CtoT_hotspot_df %>%
  filter(APOBEC == "A3A") %>%
  filter(grepl("TC>", DNA_sig_cont)) %>%
  filter(type == "recur") %>%
  pull(info) %>%
  unique()
A3B_hotspot_TCN_site <- rna_editing_CtoT_hotspot_df %>%
  filter(APOBEC == "A3B") %>%
  filter(grepl("TC>", DNA_sig_cont)) %>%
  filter(type == "recur") %>%
  pull(info) %>%
  unique()

hotspot_site_TCN_sets <- list(
  Group1 = A3A_hotspot_TCN_site,
  Group2 = A3B_hotspot_TCN_site
)

p_ven_hotspot_TCN <- ggVennDiagram(
  hotspot_site_TCN_sets,
  label_alpha = 0,
  category.names = c("A3A", "A3B")
) +
  scale_fill_gradient(low = "white", high = "blue") +
  theme(text = element_text(angle = 45))
p_ven_hotspot_TCN
ggsave(
  "/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/hotspot/RNA_editing_common_site.hotspot.TCN.venn.pdf",
  p_ven_hotspot_TCN,
  height = 8,
  width = 10
)





##4. compare secondary structure
# Load the secondary-structure table for the C>T target sites.
redit_str_df<-read_tsv("/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.secondary_structure.txt")


# GC_count_in_stem: number of G/C characters in Stem1 (0 when Stem1 is NA).
# TCN: "TCN" when sig_cont contains "TC>", otherwise "non-TCN".
redit_str_df<-redit_str_df%>%mutate(GC_count_in_stem=ifelse(is.na(Stem1),0,str_count(Stem1,"[GC]")))%>%mutate(TCN=ifelse(grepl("TC>",sig_cont),"TCN","non-TCN"))


#redit_str_max_df<-redit_str_df%>%
#  group_by(`#CHROM`,POS,id)%>%
# filter(`Stem strength`==max(`Stem strength`))%>%
#  filter(GC_count_in_stem==max(GC_count_in_stem))%>%
#  mutate(multi=ifelse(n()>1,"y","n"))%>%
#  ungroup()
#colnames(redit_str_df)
# Pick one structure per position (#CHROM, POS): keep the rows with the highest
# stem strength, among those the highest stem GC count, and finally the single
# row with the smallest loop size.
redit_str_max_df<-redit_str_df%>%
  group_by(`#CHROM`,POS)%>%
  filter(`Stem strength`==max(`Stem strength`))%>%
  filter(GC_count_in_stem==max(GC_count_in_stem))%>%
  slice(which.min(`Loop size`)) %>%
  ungroup()

# Label the enzyme from the sample name (A3A if it contains "A3A", else A3B)
# and recompute the TCN flag.
redit_str_max_df<-redit_str_max_df%>%mutate(APOBEC=ifelse(grepl("A3A",new_id),"A3A","A3B"))%>%mutate(TCN=ifelse(grepl("TC>",sig_cont),"TCN","non-TCN"))

# Drop columns 9-18 by position and de-duplicate the remaining rows.
# NOTE(review): which columns sit at positions 9-18 depends on the input file
# layout, which is not visible here - confirm they are the intended ones.
redit_str_max_collapse_df<-redit_str_max_df%>%
  select(-c(9:18))%>%
  unique()

# Convert the structure measures to factors with fixed level sets; any value
# outside the listed levels becomes NA.
redit_str_max_collapse_df<-redit_str_max_collapse_df%>%mutate(`Stem strength`=factor(`Stem strength`,levels=c(0:27)))
redit_str_max_collapse_df<-redit_str_max_collapse_df%>%mutate(`Stem length`=factor(`Stem length`,levels=c(0:10)))
redit_str_max_collapse_df<-redit_str_max_collapse_df%>%mutate(`Loop size`=factor(`Loop size`,levels=c(0:11)))
redit_str_max_collapse_df<-redit_str_max_collapse_df%>%mutate(`Cytosine position in loop`=factor(`Cytosine position in loop`,levels=c(0:11)))
redit_str_max_collapse_df%>%select(`Cytosine position in loop`)

# Write the collapsed table and read it back; after the round trip the column
# types are whatever read_tsv infers from the text, not the factors set above.
redit_str_max_collapse_df%>%
  write.table("/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.secondary_structure.max_collapse.txt",
              sep="\t",
              quote=F,
              row.names=F)
redit_str_max_collapse_df<-read_tsv("/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.secondary_structure.max_collapse.txt")
# Position key "<#CHROM>_<POS>", used to match against the hotspot site lists.
redit_str_max_collapse_df<-redit_str_max_collapse_df%>%mutate(info=paste(`#CHROM`,POS,sep="_"))

# Cut each hotspot site key at the first "_G_" or "_C_" so that only the part
# before it remains, matching the position key built above.
A3A_hotspot_site_info<-gsub("_[GC]_.*","",A3A_hotspot_site)
A3B_hotspot_site_info<-gsub("_[GC]_.*","",A3B_hotspot_site)

# Split the hotspot positions into shared, A3A-only and A3B-only sets.
common_hotspot_site_info<-intersect(A3A_hotspot_site_info,A3B_hotspot_site_info)
A3A_uniq_hotspot_site_info<-setdiff(A3A_hotspot_site_info,A3B_hotspot_site_info)
A3B_uniq_hotspot_site_info<-setdiff(A3B_hotspot_site_info,A3A_hotspot_site_info)


# Site-level view: keep only the listed site and structure columns, de-duplicated.
redit_str_max_collapse_sim_df<-redit_str_max_collapse_df%>%
  select(`#CHROM`,POS,REF,ALT,sig_cont,VEP_region,vep_gene_name,"Cytosine position","Stem1","Loop","Stem2","Stem length","Stem strength","Loop size","Cytosine position in loop","Flanking nucleotides","GC_count_in_stem","TCN","info")%>%
  unique()



# Write the site-level table and read it back from disk.
redit_str_max_collapse_sim_df%>%
  write.table("/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.secondary_structure.max_collapse.sim.txt",
              sep="\t",
              quote=F,
              row.names=F)
redit_str_max_collapse_sim_df<-read_tsv("/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.secondary_structure.max_collapse.sim.txt")
# Secondary-structure rows for positions that are hotspots in A3A only.
A3A_hotspot_ss_df <- redit_str_max_collapse_sim_df %>%
  filter(info %in% A3A_uniq_hotspot_site_info)

# Count of TCN sites per cytosine position, one panel per loop size.
# The outline colour is set as a constant on the layer: inside aes() the string
# "black" would be treated as a data value (one-level legend, default palette
# colour) rather than as the colour black.
p_A3A_hotspot_structure <- A3A_hotspot_ss_df %>%
  filter(TCN == "TCN") %>%
  ggplot(aes(x = `Cytosine position in loop`)) +
  geom_bar(colour = "black") +
  facet_wrap(~`Loop size`) +
  theme_bw()
p_A3A_hotspot_structure
ggsave(
  "/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/sample/A3A_hotspot_TCN_structure.pdf",
  p_A3A_hotspot_structure,
  height = 9,
  width = 10
)

# Every (loop size, position) combination that can occur: for each observed
# non-zero loop size, positions 1..loop size.
A3A_all_positions <- A3A_hotspot_ss_df %>%
  filter(`Loop size` != 0) %>%
  filter(TCN == "TCN") %>%
  distinct(`Loop size`) %>%
  rowwise() %>%
  mutate(pos_list = list(seq_len(`Loop size`))) %>%
  unnest(pos_list) %>%
  rename(`Cytosine position in loop` = pos_list)

# Observed number of TCN sites per loop size and cytosine position. The
# position is converted to integer so it has the same type as the generated
# positions it is joined against.
A3A_counts_data <- A3A_hotspot_ss_df %>%
  filter(`Loop size` != 0) %>%
  filter(TCN == "TCN") %>%
  mutate(`Cytosine position in loop` = as.integer(as.character(`Cytosine position in loop`))) %>%
  group_by(`Loop size`, `Cytosine position in loop`) %>%
  summarize(count = n(), .groups = "drop")

# Join observed counts onto the full grid; positions never observed get 0.
A3A_complete_data <- A3A_all_positions %>%
  left_join(
    A3A_counts_data,
    by = c("Loop size", "Cytosine position in loop")
  ) %>%
  mutate(count = ifelse(is.na(count), 0, count))
# Inspect the loop sizes. This has to come after the assignment above: reading
# the object before it exists stops a fresh session with "object not found".
A3A_complete_data$`Loop size`

# Plot input: loop size and position as factors with levels 1..11; rows whose
# position falls outside those levels become NA and are removed.
A3A_complete_data_excl <- A3A_complete_data %>% filter(`Loop size` != 0)
A3A_complete_data_excl$`Loop size` <- factor(A3A_complete_data_excl$`Loop size`, levels = c(1:11))
A3A_complete_data_excl$`Cytosine position in loop` <- factor(A3A_complete_data_excl$`Cytosine position in loop`, levels = c(1:11))
A3A_complete_data_excl %>% arrange(`Cytosine position in loop`)
#A3B_complete_data_excl%>%print(n=100)
A3A_complete_data_excl <- A3A_complete_data_excl %>% filter(!is.na(`Cytosine position in loop`))

# Bar chart of counts per position, one strip per loop size.
# NOTE(review): the y axis is fixed at 0-300 - confirm no count exceeds 300.
p_A3A_hotspot_structure_excl <- ggplot(A3A_complete_data_excl, aes(x = factor(`Cytosine position in loop`), y = count)) +
  geom_bar(stat = "identity", fill = "steelblue", col = "black") +
  facet_grid(. ~ `Loop size`, switch = "x", scales = "free_x", space = "free_x") +
  scale_y_continuous(expand = expansion(mult = c(0, NA)),
                     limits = c(0, 300)) +
  theme_bw() +
  theme(
    #panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    strip.placement = "outside",
    strip.text.x = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
    panel.spacing = unit(1, "lines")
  ) +
  labs(
    x = "Cytosine position in loop",
    y = "Count"
  )

p_A3A_hotspot_structure_excl
ggsave(
  "/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/sample/A3A_hotspot_TCN_structure.excl.pdf",
  p_A3A_hotspot_structure_excl,
  height = 3,
  width = 10
)


# Secondary-structure rows for positions that are hotspots in A3B only.
A3B_hotspot_ss_df <- redit_str_max_collapse_sim_df %>%
  filter(info %in% A3B_uniq_hotspot_site_info)

# Count of TCN sites per cytosine position, one panel per loop size.
# The outline colour is set as a constant on the layer: inside aes() the string
# "black" would be treated as a data value (one-level legend, default palette
# colour) rather than as the colour black.
p_A3B_hotspot_structure <- A3B_hotspot_ss_df %>%
  filter(TCN == "TCN") %>% #filter(`Cytosine position in loop`!=0)%>%
  ggplot(aes(x = `Cytosine position in loop`)) +
  geom_bar(colour = "black") +
  facet_wrap(~`Loop size`) +
  theme_bw()
p_A3B_hotspot_structure
ggsave(
  "/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/sample/A3B_hotspot_TCN_structure.pdf",
  p_A3B_hotspot_structure,
  height = 5,
  width = 10
)

# Every (loop size, position) combination that can occur: for each observed
# non-zero loop size, positions 1..loop size.
A3B_all_positions <- A3B_hotspot_ss_df %>%
  filter(`Loop size` != 0) %>%
  filter(TCN == "TCN") %>%
  distinct(`Loop size`) %>%
  rowwise() %>%
  mutate(pos_list = list(seq_len(`Loop size`))) %>%
  unnest(pos_list) %>%
  rename(`Cytosine position in loop` = pos_list)

# Observed number of TCN sites per loop size and cytosine position. The
# position is converted to integer so it has the same type as the generated
# positions it is joined against.
A3B_counts_data <- A3B_hotspot_ss_df %>%
  filter(`Loop size` != 0) %>%
  filter(TCN == "TCN") %>%
  mutate(`Cytosine position in loop` = as.integer(as.character(`Cytosine position in loop`))) %>%
  group_by(`Loop size`, `Cytosine position in loop`) %>%
  summarize(count = n(), .groups = "drop")

# Join observed counts onto the full grid; positions never observed get 0.
A3B_complete_data <- A3B_all_positions %>%
  left_join(
    A3B_counts_data,
    by = c("Loop size", "Cytosine position in loop")
  ) %>%
  mutate(count = ifelse(is.na(count), 0, count))
# Inspect the loop sizes. This has to come after the assignment above: reading
# the object before it exists stops a fresh session with "object not found".
A3B_complete_data$`Loop size`

# Plot input: loop size and position as factors with levels 1..11; rows whose
# position falls outside those levels become NA and are removed.
A3B_complete_data_excl <- A3B_complete_data %>% filter(`Loop size` != 0)
A3B_complete_data_excl$`Loop size` <- factor(A3B_complete_data_excl$`Loop size`, levels = c(1:11))
A3B_complete_data_excl$`Cytosine position in loop` <- factor(A3B_complete_data_excl$`Cytosine position in loop`, levels = c(1:11))
A3B_complete_data_excl %>% arrange(`Cytosine position in loop`)
A3B_complete_data_excl %>% print(n = 100)
A3B_complete_data_excl <- A3B_complete_data_excl %>% filter(!is.na(`Cytosine position in loop`))

# Bar chart of counts per position, one strip per loop size.
# NOTE(review): the y axis is fixed at 0-600 - confirm no count exceeds 600.
p_A3B_hotspot_structure_excl <- ggplot(A3B_complete_data_excl, aes(x = factor(`Cytosine position in loop`), y = count)) +
  geom_bar(stat = "identity", fill = "steelblue", col = "black") +
  facet_grid(. ~ `Loop size`, switch = "x", scales = "free_x", space = "free_x") +
  scale_y_continuous(expand = expansion(mult = c(0, NA)),
                     limits = c(0, 600),
                     breaks = c(0, 200, 400, 600),
                     labels = c(0, 200, 400, 600)) +
  theme_bw() +
  theme(
    #panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    strip.placement = "outside",
    strip.text.x = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
    panel.spacing = unit(1, "lines")
  ) +
  labs(
    x = "Cytosine position in loop",
    y = "Count"
  )


p_A3B_hotspot_structure_excl
ggsave(
  "/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/sample/A3B_hotspot_TCN_structure.excl.pdf",
  p_A3B_hotspot_structure_excl,
  height = 3,
  width = 10
)


# Secondary-structure rows for positions that are hotspots in both A3A and A3B.
common_hotspot_ss_df <- redit_str_max_collapse_sim_df %>%
  filter(info %in% common_hotspot_site_info)

# Count of TCN sites per cytosine position, one panel per loop size.
# The outline colour is set as a constant on the layer: inside aes() the string
# "black" would be treated as a data value (one-level legend, default palette
# colour) rather than as the colour black.
p_common_hotspot_structure <- common_hotspot_ss_df %>%
  filter(TCN == "TCN") %>% #filter(`Cytosine position in loop`==0)%>%
  ggplot(aes(x = `Cytosine position in loop`)) +
  geom_bar(colour = "black") +
  facet_wrap(~`Loop size`) +
  theme_bw()
p_common_hotspot_structure
ggsave(
  "/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/sample/common_hotspot_TCN_structure.pdf",
  p_common_hotspot_structure,
  height = 9,
  width = 10
)

# Inspect the shared hotspots that have a stem, ordered by loop size.
common_hotspot_ss_df %>%
  arrange(`Cytosine position in loop`) %>%
  filter(!is.na(Stem1)) %>%
  arrange(`Loop size`) %>%
  select(-VEP_region) %>%
  select(-`Cytosine position`)

# Every (loop size, position) combination that can occur: for each observed
# non-zero loop size, positions 1..loop size.
common_all_positions <- common_hotspot_ss_df %>%
  filter(`Loop size` != 0) %>%
  filter(TCN == "TCN") %>%
  distinct(`Loop size`) %>%
  rowwise() %>%
  mutate(pos_list = list(seq_len(`Loop size`))) %>%
  unnest(pos_list) %>%
  rename(`Cytosine position in loop` = pos_list)

# Observed number of TCN sites per loop size and cytosine position. The
# position is converted to integer so it has the same type as the generated
# positions it is joined against.
common_counts_data <- common_hotspot_ss_df %>%
  filter(`Loop size` != 0) %>%
  filter(TCN == "TCN") %>%
  mutate(`Cytosine position in loop` = as.integer(as.character(`Cytosine position in loop`))) %>%
  group_by(`Loop size`, `Cytosine position in loop`) %>%
  summarize(count = n(), .groups = "drop")

# Join observed counts onto the full grid; positions never observed get 0.
common_complete_data <- common_all_positions %>%
  left_join(
    common_counts_data,
    by = c("Loop size", "Cytosine position in loop")
  ) %>%
  mutate(count = ifelse(is.na(count), 0, count))
common_complete_data %>% filter(`Loop size` == 0)

# Plot input: loop size and position as factors with levels 1..11; rows whose
# position falls outside those levels become NA and are removed.
common_complete_data_excl <- common_complete_data %>% filter(`Loop size` != 0)
common_complete_data_excl$`Loop size` <- factor(common_complete_data_excl$`Loop size`, levels = c(1:11))
common_complete_data_excl$`Cytosine position in loop` <- factor(common_complete_data_excl$`Cytosine position in loop`, levels = c(1:11))

common_complete_data_excl <- common_complete_data_excl %>% filter(!is.na(`Cytosine position in loop`))

# Bar chart of counts per position, one strip per loop size.
# NOTE(review): the y axis is fixed at 0-1500 - confirm no count exceeds 1500.
p_common_hotspot_structure_excl <- ggplot(common_complete_data_excl, aes(x = factor(`Cytosine position in loop`), y = count)) +
  geom_bar(stat = "identity", fill = "steelblue", col = "black") +
  facet_grid(. ~ `Loop size`, switch = "x", scales = "free_x", space = "free_x") +
  scale_y_continuous(expand = expansion(mult = c(0, NA)),
                     limits = c(0, 1500),
                     breaks = c(0, 500, 1000, 1500),
                     labels = c(0, 500, 1000, 1500)) +
  theme_bw() +
  theme(
    #panel.border = element_blank(),
    axis.line = element_line(color = "black"),
    strip.placement = "outside",
    strip.text.x = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
    panel.spacing = unit(1, "lines")
  ) +
  labs(
    x = "Cytosine position in loop",
    y = "Count"
  )

p_common_hotspot_structure_excl
ggsave(
  "/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/sample/common_hotspot_TCN_structure.excl.pdf",
  p_common_hotspot_structure_excl,
  height = 3,
  width = 10
)




# Quick unstyled look at the same counts (displayed only, not saved).
common_hotspot_ss_df %>%
  filter(TCN == "TCN") %>%
  ggplot(aes(x = `Cytosine position in loop`)) +
  geom_bar() +
  facet_wrap(~`Loop size`)

## save table
# C>T calls from the samples in the depth table, keyed by "<CHROM>_<POS>",
# reduced to the reporting columns, with CHROM renamed to "#CHROM".
rna_editing_CtoT_sim_df <- rna_editing_df %>%
  mutate(info = paste(`CHROM`, POS, sep = "_")) %>%
  filter(
    grepl("C>T", DNA_sig_cont),
    new_id %in% depth_fin_df$new_id
  ) %>%
  #filter(info%in%(A3A_hotspot_ss_df%>%filter(TCN=="TCN"))$info)%>%
  select(
    new_id, APOBEC, time, dose, batch, rep,
    CHROM, POS, REF, ALT, DNA_sig_cont, gene_dir,
    var_readc, depth, vaf, align_dir, rescue,
    Func_refGene, Gene_refGene, ExonicFunc_refGene, info
  ) %>%
  plyr::rename(c("CHROM" = "#CHROM"))

# Attach the secondary-structure columns (joined on the shared columns), then
# label each row by hotspot set; rows in none of the sets get the string "NA".
rna_editing_str_df <- rna_editing_CtoT_sim_df %>%
  left_join(redit_str_max_collapse_sim_df) %>%
  mutate(
    hotspot = case_when(
      info %in% A3A_hotspot_ss_df$info ~ "A3A_only",
      info %in% A3B_hotspot_ss_df$info ~ "A3B_only",
      info %in% common_hotspot_ss_df$info ~ "common",
      TRUE ~ "NA"
    )
  )

write.table(
  rna_editing_str_df,
  "/home/users/ayh/Projects/27_A3B/08_2nd_revision/RNA_editing/sample/rna_editing_str_hotspot.fin.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

### stat
rna_editing_filter_sample_CtoT_df$info

# C>T calls whose position key has no entry in the secondary-structure table.
rna_editing_filter_sample_CtoT_df %>%
  mutate(info = gsub("_[CG]_.*", "", info)) %>%
  filter(!info %in% redit_str_max_collapse_df$info) %>%
  select(id, new_id, CHROM, POS, REF, ALT)

## check out_df ##
# Same unmatched calls, keeping the position key for the lookup below.
out_df <- rna_editing_filter_sample_CtoT_df %>%
  mutate(info = gsub("_[CG]_.*", "", info)) %>%
  filter(!info %in% redit_str_max_collapse_df$info) %>%
  select(id, new_id, CHROM, POS, REF, ALT, info)

ori_str_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.txt"
) %>%
  mutate(info = paste(`#CHROM`, POS, sep = "_"))

# For the unmatched positions, tabulate the extracted sequences.
ori_str_df %>%
  filter(info %in% out_df$info) %>%
  group_by(extracted_seq) %>%
  dplyr::summarise(n = n())


# Per-sample C>T calls (position key only) joined to the structure table on
# their shared columns.
df1 <- rna_editing_filter_sample_CtoT_df %>%
  mutate(info = gsub("_[CG]_.*", "", info))
df2 <- redit_str_max_collapse_df

merge_df <- left_join(df1, df2 %>% plyr::rename(c("#CHROM" = "CHROM")))
merge_df %>% filter(info %in% A3A_uniq_hotspot_site_info)

# Label each row by hotspot set (the string "NA" when in none of them) and
# compute the allele fraction from the depth-corrected counts.
merge_df <- merge_df %>%
  mutate(
    hotspot = ifelse(
      info %in% A3A_uniq_hotspot_site_info, "A3A_only",
      ifelse(
        info %in% A3B_uniq_hotspot_site_info, "A3B_only",
        ifelse(info %in% common_hotspot_site_info, "common", "NA")
      )
    )
  ) %>%
  mutate(vaf = cor_var_readc / cor_depth)
colnames(merge_df)

# Multiply the corrected counts by depth_ratio to recover the raw counts and
# keep the reporting columns.
merge_sim_df <- merge_df %>%
  mutate(
    var_readc = cor_var_readc * depth_ratio,
    depth = cor_depth * depth_ratio
  ) %>%
  select(
    id, new_id, CHROM, POS, REF, ALT, gene_dir, align_dir,
    var_readc, depth, vaf, hotspot,
    Func_refGene, Gene_refGene, ExonicFunc_refGene,
    VEP_region, vep_gene_name, vep_gene_id, vep_transcript_id,
    sig_cont, Stem1, Loop, Stem2,
    `Stem length`, `Stem strength`, `Loop size`,
    `Cytosine position in loop`, GC_count_in_stem, TCN
  )
merge_sim_df %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_secondary_structure.CtoT.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )
colnames(merge_sim_df)

# Number of distinct hotspot sites per hotspot set, loop size and cytosine
# position. The two sample id columns and the per-sample measurements (depth,
# var_readc, vaf) are removed before de-duplicating, so a site seen in several
# samples is counted once. The three measurement columns must be negated
# together with -c(...): in `select(-depth, var_readc, vaf)` only `depth` is
# removed, the other two stay in and keep the per-sample rows distinct.
redit_hotspot_stat_df <- merge_sim_df %>%
  dplyr::filter(hotspot %in% c("A3A_only", "A3B_only", "common")) %>%
  select(-c(1:2)) %>%
  select(-c(depth, var_readc, vaf)) %>%
  unique() %>%
  group_by(hotspot, `Loop size`, `Cytosine position in loop`) %>%
  dplyr::summarise(n = n())
redit_hotspot_stat_df %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_secondary_structure.hotspot.CtoT.stat.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )
