library(dplyr)
library(tidyverse)
library(ggplot2)
library(stringr)
# ---- Load RNA-editing calls and annotate the 5-mer mutation context ----
# Read the per-sample editing table and derive POS_C = start - start_pos + 1,
# which is used below as a 1-based index into `extracted_seq`.
redit_seq_df <- read_tsv("/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/sample/RNA_editing_target_sample_ctot.41bp.30.txt") %>%
  mutate(POS_C = start - start_pos + 1)

# Interactive sanity checks. These run only after the table has been read, so
# the script also works top-to-bottom in a fresh session.
redit_seq_df %>%
  select(id, new_id) %>%
  unique()
redit_seq_df %>% colnames()

# Length of one example window (41 characters).
nchar("CCGCCTGCCAGTGGCCTCTTCAGGCCCATGGGGCTCATTCC")

# Keep the columns used downstream and label every call from `new_id`:
#   APOBEC: "A3A" when new_id contains "A3A", otherwise "A3B"
#   cond:   "3ug" when new_id contains "_3", else "CTRL" when it contains
#           "0h", else "100ng"
redit_seq_df <- redit_seq_df %>%
  select(`#CHROM`, POS, REF, ALT, POS_C, extracted_seq, gene_dir, id, new_id) %>%
  mutate(
    APOBEC = ifelse(grepl("A3A", new_id), "A3A", "A3B"),
    cond = case_when(
      grepl("_3", new_id) ~ "3ug",
      grepl("0h", new_id) ~ "CTRL",
      TRUE ~ "100ng"
    )
  )

redit_seq_df <- redit_seq_df %>%
  # unique() %>%
  # Drop calls whose +/-2 flank would fall outside `extracted_seq`.
  filter(POS_C - 2 >= 1, POS_C + 2 <= nchar(extracted_seq)) %>%
  # REF>ALT as given for "+" genes; both alleles complemented otherwise.
  mutate(
    mut_type = ifelse(
      gene_dir == "+",
      paste0(REF, ">", ALT),
      paste0(chartr("ACGT", "TGCA", REF), ">", chartr("ACGT", "TGCA", ALT))
    )
  ) %>%
  # Two bases upstream + mutation + two bases downstream, e.g. "ATC>TAA".
  # NOTE(review): flanks are taken from `extracted_seq` as-is; presumably the
  # sequence is already oriented to the gene strand -- confirm.
  mutate(
    five_sig_cont = paste0(
      substr(extracted_seq, POS_C - 2, POS_C - 1),
      mut_type,
      substr(extracted_seq, POS_C + 1, POS_C + 2)
    )
  ) %>%
  # Keep mut_type as the last column, as in the previous column layout.
  select(-mut_type, mut_type)

redit_seq_df

# `fin_df` is not created anywhere in this script; only show it when it is
# already present in the workspace instead of failing in a clean session.
if (exists("fin_df")) {
  fin_df %>%
    select(CHROM, POS, REF, ALT, gene_dir, three_bp_cont, sig_cont, five_bp_cont, five_sig_cont)
}



# Ordered labels for the 96 trinucleotide substitution classes, written as
# <5' base><ref>">"<alt><3' base> (e.g. "AC>AA"). The reference base (C, then T)
# varies slowest, then the alternate base, then the 5' base, then the 3' base.
contextorder96 <- local({
  bases <- c("A", "C", "G", "T")
  five_prime <- rep(rep(bases, each = 4), times = 6)
  ref_base <- rep(c("C", "T"), each = 48)
  alt_base <- rep(c("A", "G", "T", "A", "C", "G"), each = 16)
  three_prime <- rep(bases, times = 24)
  paste0(five_prime, ref_base, ">", alt_base, three_prime)
})

# The 96 classes followed by their reverse-complement forms: each base is
# complemented and the two flanking bases swap sides.
contextorder192 <- local({
  comp_at <- function(i) chartr("ACGT", "TGCA", substr(contextorder96, i, i))
  c(
    contextorder96,
    paste0(comp_at(5), comp_at(2), ">", comp_at(4), comp_at(1))
  )
})

# Ordered labels for the 1536 pentanucleotide classes, written as
# <-2><-1><ref>">"<alt><+1><+2> (e.g. "AAC>AAA"). Every component is spelled
# out at full length so nothing depends on paste0() recycling.
contextorder1536 <- local({
  bases <- c("A", "C", "G", "T")
  minus2 <- rep(rep(bases, each = 64), times = 6)
  minus1 <- rep(rep(bases, each = 16), times = 24)
  ref_base <- rep(c("C", "T"), each = 768)
  alt_base <- rep(c("A", "G", "T", "A", "C", "G"), each = 256)
  plus1 <- rep(rep(bases, each = 4), times = 96)
  plus2 <- rep(bases, times = 384)
  paste0(minus2, minus1, ref_base, ">", alt_base, plus1, plus2)
})

# Order the observed 5-mer contexts by the canonical 1536-class ordering.
redit_seq_df <- redit_seq_df %>%
  mutate(five_sig_cont = factor(five_sig_cont, levels = contextorder1536))

# Interactive checks: mutation types present, and the per-sample grouping.
redit_seq_df$mut_type %>% unique()
redit_seq_df %>% group_by(id, new_id, APOBEC, cond, five_sig_cont)

# Template with one row per (C>T 5-mer context, enzyme, dose). Joining counts
# onto it keeps contexts that were never observed as explicit rows.
# Columns stay in the order five_sig_cont, APOBEC, cond; rows are ordered by
# enzyme, then dose, then context.
contextorder1536_CtoT <- contextorder1536[grepl("C>T", contextorder1536)]
std_df <- tidyr::expand_grid(
  APOBEC = c("A3A", "A3B"),
  cond = c("CTRL", "100ng", "3ug"),
  five_sig_cont = contextorder1536_CtoT
) %>%
  select(five_sig_cont, APOBEC, cond)
std_df

# Number of calls per enzyme / dose / context. The context is converted back
# to character so both join keys have the same type.
redit_seq_sum_df <- left_join(
  std_df,
  redit_seq_df %>%
    group_by(APOBEC, cond, five_sig_cont) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    mutate(five_sig_cont = as.character(five_sig_cont)),
  by = c("five_sig_cont", "APOBEC", "cond")
)

# Unobserved contexts count as zero; only `n` can be NA after the join.
redit_seq_sum_df <- redit_seq_sum_df %>%
  mutate(
    n = ifelse(is.na(n), 0L, n),
    five_sig_cont = factor(five_sig_cont, levels = contextorder1536),
    cond = factor(cond, levels = c("CTRL", "100ng", "3ug"))
  )
#redit_seq_sum_df%>%
# Bar chart of call counts per 5-mer context, one panel per enzyme/dose pair.
ggplot(redit_seq_sum_df, aes(x = five_sig_cont, y = n)) +
  geom_col() +
  facet_wrap(~ APOBEC + cond)

# Contexts of the A3A 3ug group, most frequent first.
redit_seq_sum_df %>%
  filter(APOBEC == "A3A" & cond == "3ug") %>%
  arrange(desc(n)) %>%
  unique()

# Contexts of the A3B 3ug group, most frequent first.
redit_seq_sum_df %>%
  filter(APOBEC == "A3B" & cond == "3ug") %>%
  arrange(desc(n)) %>%
  unique()


## normalization
# Sample metadata and per-sample background context counts. The background
# table's `id` and `sum_n` columns are renamed to `exp_id` and `back_sum_n`
# before the metadata is joined on.
# NOTE(review): the metadata join uses whatever columns the two tables share;
# presumably `exp_id` -- confirm against the files.
metadata <- read_tsv("/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/metadata.txt")
sample_cont_df <- read_tsv("/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/redit_expressed_gene.non_zero_tpm.context.merge.txt") %>%
  plyr::rename(c("id" = "exp_id", "sum_n" = "back_sum_n")) %>%
  left_join(metadata)

# Interactive check: sample ids available in the background table.
sample_cont_df %>%
  arrange(id) %>%
  select(id) %>%
  unique()

# Calls per sample and context, joined onto the enzyme/dose/context template.
# The ">T" is stripped so `context` is the plain 5-mer (e.g. "ATCAA").
# Template rows with no matching calls keep NA in new_id, id and n.
redit_seq_sample_sum_df <- left_join(
  std_df,
  redit_seq_df %>%
    group_by(APOBEC, cond, five_sig_cont, new_id, id) %>%
    dplyr::summarise(n = n()) %>%
    ungroup() %>%
    mutate(five_sig_cont = as.character(five_sig_cont)),
  by = c("five_sig_cont", "APOBEC", "cond")
) %>%
  mutate(five_sig_cont = gsub(">T", "", five_sig_cont)) %>%
  plyr::rename(c("five_sig_cont" = "context"))

# Interactive check, placed after the table exists: new_id / id pairs.
redit_seq_sample_sum_df %>%
  arrange(id) %>%
  select(new_id, id) %>%
  unique()

# Editing rate = calls / background count, plus an ordered enzyme_dose label.
# NOTE(review): this join also relies on the shared column names of the two
# tables -- confirm the intended keys.
redit_seq_cont_fin_df <- left_join(
  redit_seq_sample_sum_df,
  sample_cont_df
) %>%
  mutate(rate = n / back_sum_n) %>%
  mutate(
    group = factor(
      paste0(APOBEC, "_", cond),
      levels = c("A3A_CTRL", "A3A_100ng", "A3A_3ug", "A3B_CTRL", "A3B_100ng", "A3B_3ug")
    )
  )

redit_seq_cont_fin_df %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/redit_seq_cont_fin_df.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# Subset to contexts whose 2nd-3rd bases are "TC" and (re)compute the rate.
redit_seq_cont_fin_tcn_df <- mutate(
  filter(redit_seq_cont_fin_df, substr(context, 2, 3) == "TC"),
  rate = n / back_sum_n
)

# Scatter of the ATCAA rate against the CTCAA rate, one point per remaining
# row after spreading; the solid line is y = x.
redit_seq_cont_fin_tcn_df %>%
  filter(context %in% c("ATCAA", "CTCAA")) %>%
  select(-n, -back_sum_n) %>%
  spread(context, rate) %>%
  ggplot(aes(x = CTCAA, y = ATCAA, colour = APOBEC, shape = cond)) +
  geom_point(size = 5) +
  ylim(0, 0.02) +
  xlim(0, 0.02) +
  geom_abline(slope = 1)


# Calls per context for the 3ug groups, summed over samples.
# Template rows for contexts without any call carry n = NA, so the sum uses
# na.rm = TRUE: such contexts count as 0 instead of turning the group total
# (and with it every proportion of that group) into NA.
redit_seq_cont_3ug_count_df <- redit_seq_cont_fin_df %>%
  filter(grepl("3ug", group)) %>%
  group_by(group, APOBEC, cond, context) %>%
  dplyr::summarise(n = sum(n, na.rm = TRUE)) %>%
  ungroup()

# Share of each context within its enzyme/dose group.
redit_seq_cont_fin_prop_df <- redit_seq_cont_3ug_count_df %>%
  group_by(group, APOBEC, cond) %>%
  mutate(tot_n = sum(n), proportion = n / tot_n) %>%
  ungroup()

# Wide count table: one row per group, one column per context.
redit_seq_cont_3ug_count_df %>%
  pivot_wider(names_from = context, values_from = n) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/five_cont/redit_five_cont_count.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# Wide proportion table. `group` and `cond` are selected explicitly because
# the code that reads this file back expects the group, cond and APOBEC columns.
redit_seq_cont_fin_prop_df %>%
  select(group, cond, APOBEC, context, proportion) %>%
  pivot_wider(names_from = context, values_from = proportion) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/five_cont/redit_five_cont_proportion.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# Side-by-side bars of the per-context proportion for the two enzymes.
redit_seq_cont_fin_prop_df %>%
  ggplot(aes(x = context, y = proportion, fill = APOBEC)) +
  geom_bar(stat = "identity", position = "dodge")



# Read the wide per-context proportion table back in.
df <- read_tsv("/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/five_cont/redit_five_cont_proportion.txt")

# Long format: one row per (group, context). Each context is tagged with its
# 2nd-3rd bases (`subcontext`), the matching "xCN" family (`context_group`)
# and its first base (`left_base`); contexts outside the four families are
# dropped.
df_long <- pivot_longer(
  df,
  cols = -c(group, cond, APOBEC),
  names_to = "context",
  values_to = "proportion"
) %>%
  mutate(
    subcontext = str_sub(context, 2, 3),
    context_group = if_else(
      subcontext %in% c("AC", "CC", "GC", "TC"),
      paste0(subcontext, "N"),
      NA_character_
    ),
    left_base = str_sub(context, 1, 1)
  ) %>%
  filter(!is.na(context_group))

# Two halves, each plotted on its own page: ACN + CCN and GCN + TCN.
df1 <- filter(df_long, context_group == "ACN" | context_group == "CCN")
df2 <- filter(df_long, context_group == "GCN" | context_group == "TCN")

# Sort a long-format context table and add plotting helpers.
#
# df: data frame with columns `context_group`, `context` and `left_base`.
#
# Returns `df` sorted by context_group then context, with four extra columns:
#   context_f    - `context` as a factor whose levels follow the sorted order
#   pos          - numeric position of each context on that axis
#   group_change - TRUE where `left_base` differs from the previous row
#   vline_x      - pos - 0.5 on rows where the left base changes, NA elsewhere
generate_context_order <- function(df) {
  sorted_df <- arrange(df, context_group, context)
  mutate(
    sorted_df,
    context_f = factor(context, levels = unique(context)),
    pos = as.numeric(context_f),
    group_change = left_base != lag(left_base, default = first(left_base)),
    vline_x = ifelse(group_change, pos - 0.5, NA)
  )
}

# Add the ordered context factor and separator positions to both halves.
df1 <- df1 %>% generate_context_order()
df2 <- df2 %>% generate_context_order()

# Largest proportion across both halves.
y_limit <- max(df1$proportion, df2$proportion)

# Dodged bar chart of per-context proportions, coloured by enzyme.
#
# data:  data frame with columns `context_f`, `proportion`, `APOBEC` and
#        `vline_x`.
# title: plot title.
# y_max: upper limit of the y axis; defaults to 0.042, the value that used to
#        be hard-coded. Bars above this limit are dropped by the scale.
#
# Returns a ggplot object. Dotted vertical lines are drawn at the non-NA
# values of `vline_x`.
make_plot <- function(data, title, y_max = 0.042) {
  vlines <- unique(na.omit(data$vline_x))

  ggplot(data, aes(x = context_f, y = proportion, fill = APOBEC)) +
    # The outline colour is a fixed setting, so it belongs outside aes();
    # inside aes() it is treated as a data value, which yields a default-
    # palette outline plus a spurious "black" legend entry.
    geom_bar(stat = "identity", position = "dodge", colour = "black") +
    geom_vline(xintercept = vlines, linetype = "dotted", color = "gray40") +
    scale_fill_manual(values = c("A3A" = "tomato", "A3B" = "skyblue")) +
    scale_y_continuous(limits = c(0, y_max), expand = c(0, 0)) +
    scale_x_discrete(expand = c(0, 0)) +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
      panel.grid = element_blank(),
      plot.title = element_text(hjust = 0.5)
    ) +
    labs(x = "Context", y = "Proportion", title = title)
}

# Build one figure per half and show each.
p1 <- make_plot(data = df1, title = "ACN & CCN")
p2 <- make_plot(data = df2, title = "GCN & TCN")
p1
p2

# Write both figures as 10 x 8 PDFs.
ggsave(
  filename = "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/five_cont/ACN_CCN.pdf",
  plot = p1,
  width = 10,
  height = 8
)
ggsave(
  filename = "/home/users/ayh/Projects/27_A3B/07_revision/RNA_editing/five_cont/GCN_TCN.pdf",
  plot = p2,
  width = 10,
  height = 8
)

# Stack the two figures in a single column on screen.
gridExtra::grid.arrange(p1, p2, ncol = 1)





# Ordered enzyme_dose label used as the x axis of the box plots.
redit_seq_cont_fin_tcn_df <- mutate(
  redit_seq_cont_fin_tcn_df,
  group = factor(
    paste0(APOBEC, "_", cond),
    levels = c("A3A_CTRL", "A3A_100ng", "A3A_3ug", "A3B_CTRL", "A3B_100ng", "A3B_3ug")
  )
)

# Sample ids present in the TCN subset.
unique(redit_seq_cont_fin_tcn_df$id)

# Box plot of the editing rate per group for one enzyme, one panel per
# context, with the y axis limited to [0, 0.023].
plot_tcn_rate_box <- function(enzyme, plot_title) {
  redit_seq_cont_fin_tcn_df %>%
    filter(APOBEC == enzyme) %>%
    ggplot(aes(x = group, y = rate)) +
    geom_boxplot() +
    facet_wrap(~ context) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    ylim(0, 0.023) +
    ggtitle(plot_title)
}

plot_tcn_rate_box("A3A", "A3A_redit")

# Per-context rates of two individual samples, highest rate first.
redit_seq_cont_fin_tcn_df %>%
  filter(APOBEC == "A3B", id == "A3B_1st_C5_3ng_48h_1") %>%
  arrange(desc(rate))
redit_seq_cont_fin_tcn_df %>%
  filter(APOBEC == "A3A", id == "A3A_1st_C3_48h_3ug-1") %>%
  arrange(desc(rate))

plot_tcn_rate_box("A3B", "A3B_redit")
