library(dplyr)
library(tidyverse)
library(ggplot2)

# Reference lengths per replication direction (R / L) and motif (TCN / NGA).
# The four input files share one directory and are read without a header, so
# their values end up in column V1.
ref_len_dir <- "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/rep_dir"

# Read one header-less reference-length file located in `ref_len_dir`.
read_ref_len <- function(file_name) {
  read.csv(file.path(ref_len_dir, file_name), header = FALSE)
}

ref_R_TCN_len <- read_ref_len("rep_time_bin4.cut25.dir.R.TCN.txt")
ref_R_NGA_len <- read_ref_len("rep_time_bin4.cut25.dir.R.NGA.txt")
ref_L_TCN_len <- read_ref_len("rep_time_bin4.cut25.dir.L.TCN.txt")
ref_L_NGA_len <- read_ref_len("rep_time_bin4.cut25.dir.L.NGA.txt")

# lagging = R/TCN + L/NGA, leading = R/NGA + L/TCN; both sums are then doubled.
# NOTE(review): the downstream rate calculations divide by these columns as
# scalars, so each input file is presumably a single value - confirm.
ref_len_df <- data.frame(
  lagging = ref_R_TCN_len$V1 + ref_L_NGA_len$V1,
  leading = ref_R_NGA_len$V1 + ref_L_TCN_len$V1
) %>%
  mutate(
    lagging = lagging * 2,
    leading = leading * 2
  )


# Replication-direction annotation per variant. Only the chromosome, position,
# sample id and replication direction are kept; the `r2` column is exposed
# under the name POS.
dir_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/05_vcf/annotation_total/bin5_tot/APOBEC_epi_merge.A3A.spread.tsv") %>%
  select(`#CHROM`, POS = r2, id, rep_dir)

# Spot checks. The position column is named POS at this point, so the lookup
# must use POS; filtering on `r2` fails because that column no longer exists.
dir_df %>% filter(POS == 7253249)
dir_df %>% filter(!is.na(rep_dir))

dir_df %>% arrange(id, `#CHROM`, POS)
# Mutation table. When ALT is a single base (A/C/G/T) the existing mut_type is
# kept; otherwise mut_type is replaced by characters 2-4 of sig_cont.
mut_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/v3/A3A.total.vcf") %>%
  mutate(
    mut_type = ifelse(
      ALT %in% c("A", "C", "G", "T"),
      mut_type,
      substr(sig_cont, 2, 4)
    )
  )

# Quick look at a single sample.
mut_df %>%
  filter(id == "A3A_C3_TP53_C3_100ng-1")

# Attach the replication direction (natural join on the shared columns) and
# keep only rows whose rep_dir is something other than ".".
merge_df <- mut_df %>%
  left_join(dir_df) %>%
  filter(rep_dir != ".")

# Sample-level labels derived from substrings of the sample id.
merge_df <- merge_df %>%
  mutate(
    dose = ifelse(
      grepl("_3", id),
      "3ug/ml",
      ifelse(grepl("_100", id), "0.1ug/ml", "CTRL")
    ),
    APOBEC = ifelse(grepl("A3A", id), "A3A", "A3B"),
    TP53 = ifelse(grepl("TP53", id), "KO", "WT")
  )

# ALT alleles present after the join.
unique(merge_df$ALT)
# Strand class from the reference base and the replication direction:
#   C + R or G + L -> lagging;  C + L or G + R -> leading.
# Anything else gets the placeholder string "NA", which is not one of the
# factor levels and therefore becomes a missing value in the factor.
merge_df <- merge_df %>%
  mutate(
    rep_std = case_when(
      REF == "C" & rep_dir == "R" ~ "lagging",
      REF == "C" & rep_dir == "L" ~ "leading",
      REF == "G" & rep_dir == "L" ~ "lagging",
      REF == "G" & rep_dir == "R" ~ "leading",
      TRUE ~ "NA"
    ),
    rep_std = factor(rep_std, levels = c("leading", "lagging"))
  )

# Sample ids present in the merged table.
unique(merge_df$id)
# Per-sample table: mutation counts on each strand class, the counts divided
# by the matching reference length (rate), and each rate's share of the two
# rates combined (ratio). Written as a tab-separated file.
merge_df %>%
  group_by(id, TP53, rep_std) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading,
    lagging_rate = lagging / ref_len_df$lagging,
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  ) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/data/rep_std/rep_std_sample.txt",
    row.names = FALSE,
    quote = FALSE,
    sep = "\t"
  )


# t-test of leading_ratio against lagging_ratio across samples, run separately
# for each TP53 status. The per-sample ratios are computed as in the table
# written to rep_std_sample.txt.
merge_df %>%
  group_by(id, TP53, rep_std) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading,
    lagging_rate = lagging / ref_len_df$lagging,
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  ) %>%
  select(id, TP53, leading_ratio, lagging_ratio) %>%
  # Long format: one row per sample and ratio column.
  gather(class, ratio, leading_ratio, lagging_ratio) %>%
  group_by(TP53) %>%
  rstatix::t_test(ratio ~ class)

#TP53  .y.   group1        group2           n1    n2 statistic    df        p
#* <fct> <chr> <chr>         <chr>         <int> <int>     <dbl> <dbl>    <dbl>
#1 WT    ratio lagging_ratio leading_ratio    10    10      12.6    18 2.34e-10
#2 KO    ratio lagging_ratio leading_ratio     6     6      42.3    10 1.3 e-12  



# Per sample and strand class: counts per mutation type, plus the fraction of
# C>T and of C>G among the two combined. Written as a tab-separated file.
merge_df %>%
  group_by(id, TP53, rep_std, mut_type) %>%
  dplyr::summarise(n = n()) %>%
  spread(mut_type, n) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  ) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/data/rep_std/rep_std_sample.ratio.mut_type.txt",
    row.names = FALSE,
    quote = FALSE,
    sep = "\t"
  )
  


# Per sample and mutation type: leading/lagging counts, length-normalised
# rates, and each rate's share of the two combined. Written as a
# tab-separated file.
merge_df %>%
  group_by(id, TP53, mut_type, rep_std) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading,
    lagging_rate = lagging / ref_len_df$lagging,
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  ) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/data/rep_std/rep_std_sample.ratio.mut_type.region.txt",
    row.names = FALSE,
    quote = FALSE,
    sep = "\t"
  )


# Per TP53 status: pooled leading/lagging counts, length-normalised rates and
# ratios. The pooled rate is additionally divided by 10 for WT and by 6 for KO.
# NOTE(review): 10 and 6 look like hard-coded sample counts per TP53 group -
# confirm they still match the data.
merge_df %>%
  group_by(TP53, rep_std) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = ifelse(
      TP53 == "WT",
      leading / ref_len_df$leading / 10,
      leading / ref_len_df$leading / 6
    ),
    lagging_rate = ifelse(
      TP53 == "WT",
      lagging / ref_len_df$lagging / 10,
      lagging / ref_len_df$lagging / 6
    ),
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  ) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/data/rep_std/rep_std_TP53.txt",
    row.names = FALSE,
    quote = FALSE,
    sep = "\t"
  )



# Per TP53 status and strand class: counts per mutation type, plus the
# fraction of C>T and of C>G among the two combined. Written as a
# tab-separated file.
merge_df %>%
  group_by(TP53, rep_std, mut_type) %>%
  dplyr::summarise(n = n()) %>%
  spread(mut_type, n) %>%
  mutate(
    `C>T_ratio` = `C>T` / (`C>T` + `C>G`),
    `C>G_ratio` = `C>G` / (`C>T` + `C>G`)
  ) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/data/rep_std/rep_TP53.ratio.mut_type.txt",
    row.names = FALSE,
    quote = FALSE,
    sep = "\t"
  )


# Per TP53 status and mutation type: pooled leading/lagging counts,
# length-normalised rates (divided by 10 for WT, 6 for KO) and ratios.
# NOTE(review): 10 and 6 look like hard-coded sample counts per TP53 group -
# confirm they still match the data.
merge_df %>%
  group_by(TP53, mut_type, rep_std) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = ifelse(
      TP53 == "WT",
      leading / ref_len_df$leading / 10,
      leading / ref_len_df$leading / 6
    ),
    lagging_rate = ifelse(
      TP53 == "WT",
      lagging / ref_len_df$lagging / 10,
      lagging / ref_len_df$lagging / 6
    ),
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  ) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/data/rep_std/rep_std_TP53.ratio.mut_type.region.txt",
    row.names = FALSE,
    quote = FALSE,
    sep = "\t"
  )

# Pooled over every sample: leading/lagging counts, per-sample
# length-normalised rates and ratios (printed only).
# After grouping by rep_std alone there is no TP53 column left, so the rate
# cannot branch on TP53; the pooled counts are divided by the number of
# distinct sample ids instead.
# NOTE(review): dividing by all samples is the assumed intent for this pooled
# view - confirm. The ratios do not depend on this divisor.
merge_df %>%
  group_by(rep_std) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading / n_distinct(merge_df$id),
    lagging_rate = lagging / ref_len_df$lagging / n_distinct(merge_df$id),
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  )

# Pooled over every sample, per mutation type: leading/lagging counts,
# per-sample length-normalised rates and ratios (printed only).
# TP53 is not a grouping column here, so it is not available after
# summarise(); the pooled counts are divided by the number of distinct sample
# ids instead of branching on TP53.
# NOTE(review): dividing by all samples is the assumed intent for this pooled
# view - confirm. The ratios do not depend on this divisor.
merge_df %>%
  group_by(mut_type, rep_std) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading / n_distinct(merge_df$id),
    lagging_rate = lagging / ref_len_df$lagging / n_distinct(merge_df$id),
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  )


# t-test of the per-sample leading rate against the lagging rate, run
# separately for each TP53 status.
merge_df %>%
  group_by(id, TP53, rep_std) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading,
    lagging_rate = lagging / ref_len_df$lagging
  ) %>%
  ungroup() %>%
  select(id, TP53, leading_rate, lagging_rate) %>%
  # Long format, then strip the "_rate" suffix so class is leading / lagging.
  gather(class, rate, leading_rate, lagging_rate) %>%
  mutate(class = gsub("_rate", "", class)) %>%
  group_by(TP53) %>%
  # Optional restriction to WT samples only:
  #filter(TP53=="WT")%>%
  rstatix::t_test(rate ~ class)


#.y.   group1  group2     n1    n2 statistic    df     p
#* <chr> <chr>   <chr>   <int> <int>     <dbl> <dbl> <dbl>
#1 rate  lagging leading    10    10      1.44  12.0 0.176

# Mean per-sample rate for each TP53 status and strand class, with standard
# deviation, standard error and a two-sided 95% t-based confidence interval.
merge_rep_std_rate_ci_df <- merge_df %>%
  group_by(id, TP53, rep_std) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading,
    lagging_rate = lagging / ref_len_df$lagging
  ) %>%
  ungroup() %>%
  select(id, TP53, leading_rate, lagging_rate) %>%
  gather(class, rate, leading_rate, lagging_rate) %>%
  mutate(class = gsub("_rate", "", class)) %>%
  group_by(TP53, class) %>%
  summarise(
    mean.rate = mean(rate, na.rm = TRUE),
    sd.rate = sd(rate, na.rm = TRUE),
    n.rate = n()
  ) %>%
  mutate(
    se.rate = sd.rate / sqrt(n.rate),
    lower.ci.rate = mean.rate - qt(1 - (0.05 / 2), n.rate - 1) * se.rate,
    upper.ci.rate = mean.rate + qt(1 - (0.05 / 2), n.rate - 1) * se.rate
  )

# Fix the display order of the facets and of the x axis.
merge_rep_std_rate_ci_df$TP53 <- factor(
  merge_rep_std_rate_ci_df$TP53,
  levels = c("WT", "KO")
)
merge_rep_std_rate_ci_df$class <- factor(
  merge_rep_std_rate_ci_df$class,
  levels = c("leading", "lagging")
)

# Quick look: mean rate (multiplied by 1000000) with CI bars, one panel per
# TP53 status.
merge_rep_std_rate_ci_df %>%
  ggplot(aes(x = class, y = mean.rate * 1000000)) +
  geom_bar(stat = "identity") +
  geom_errorbar(
    aes(
      y = mean.rate * 1000000,
      ymax = upper.ci.rate * 1000000,
      ymin = lower.ci.rate * 1000000
    )
  ) +
  facet_wrap(~TP53)


# Bar plot for WT only: mean per-sample rate (multiplied by 1000000) per
# strand class with 95% CI error bars, saved as a PDF.
p_rep_std_sample <- merge_rep_std_rate_ci_df %>%
  filter(TP53 == "WT") %>%
  ggplot(
    aes(x = class, y = mean.rate * 1000000, col = "black", fill = class)
  ) +
  geom_bar(stat = "identity") +
  geom_errorbar(
    aes(
      y = mean.rate * 1000000,
      ymax = upper.ci.rate * 1000000,
      ymin = lower.ci.rate * 1000000
    ),
    width = .5
  ) +
  #facet_wrap(~TP53)+
  theme_classic() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 30),
    #plot.title=element_text(size=30),
    axis.text.y = element_text(size = 40),
    axis.title.y = element_text(size = 40),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks = element_line(size = 4),
    legend.position = "none"
  ) +
  scale_colour_manual(values = c("black")) +
  scale_y_continuous(limits = c(0, 15))

p_rep_std_sample

ggsave(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/rep_std.sample.pdf",
  p_rep_std_sample,
  height = 10,
  width = 8
)

# Average mutation rate table; it is read again further down for the dashed
# reference line of p_rep_std_v2.
avg_rate_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/01_refernce/fin/v3/mappable/average_mut_rate_TCN_NGA.region.txt")

# Preview of the pooled WT rates in long format (columns 4 and 5 are gathered).
#p_rep_std_TP53<-
merge_df %>%
  group_by(TP53, rep_std) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = ifelse(
      TP53 == "WT",
      leading / ref_len_df$leading / 10,
      leading / ref_len_df$leading / 6
    ),
    lagging_rate = ifelse(
      TP53 == "WT",
      lagging / ref_len_df$lagging / 10,
      lagging / ref_len_df$lagging / 6
    )
  ) %>%
  filter(TP53 == "WT") %>%
  gather(class, rate, 4:5)
  
  
  
  
  
  # Bar plot for WT: pooled leading/lagging rate (multiplied by 1000000).
  # Rows are de-duplicated after dropping the sample id, so a row that is
  # identical across samples is counted once before the counts are taken.
  p_rep_std <- merge_df %>%
    group_by(TP53, rep_std) %>%
    select(-id) %>%
    unique() %>%
    dplyr::summarise(n = n()) %>%
    spread(rep_std, n) %>%
    mutate(
      leading_rate = ifelse(
        TP53 == "WT",
        leading / ref_len_df$leading / 10,
        leading / ref_len_df$leading / 6
      ),
      lagging_rate = ifelse(
        TP53 == "WT",
        lagging / ref_len_df$lagging / 10,
        lagging / ref_len_df$lagging / 6
      ),
      leading_ratio = leading_rate / (leading_rate + lagging_rate),
      lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
    ) %>%
    filter(TP53 == "WT") %>%
    gather(region, rate, leading_rate:lagging_rate) %>%
    ggplot(aes(x = region, y = rate * 1000000, fill = region, col = TP53)) +
    geom_bar(stat = "identity") +
    #ggtitle("WT_rep.strand")+
    theme_classic() +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(size = 30),
      #plot.title=element_text(size=30),
      axis.text.y = element_text(size = 40),
      axis.title.y = element_text(size = 40),
      axis.ticks.length = unit(.4, "cm"),
      axis.ticks = element_line(size = 4),
      legend.position = "none"
    ) +
    scale_colour_manual(values = c("black")) +
    scale_y_continuous(limits = c(0, 3))

p_rep_std

ggsave(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/rep_std.pdf",
  p_rep_std,
  height = 10,
  width = 8
)



# Same WT bar plot as p_rep_std, with a slightly taller y axis and a dashed
# horizontal line at the WT "mappable" mut_rate from avg_rate_df (multiplied
# by 1000000).
p_rep_std_v2 <- merge_df %>%
  group_by(TP53, rep_std) %>%
  select(-id) %>%
  unique() %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = ifelse(
      TP53 == "WT",
      leading / ref_len_df$leading / 10,
      leading / ref_len_df$leading / 6
    ),
    lagging_rate = ifelse(
      TP53 == "WT",
      lagging / ref_len_df$lagging / 10,
      lagging / ref_len_df$lagging / 6
    ),
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  ) %>%
  filter(TP53 == "WT") %>%
  gather(region, rate, leading_rate:lagging_rate) %>%
  ggplot(aes(x = region, y = rate * 1000000, fill = region, col = TP53)) +
  geom_bar(stat = "identity") +
  #ggtitle("WT_rep.strand")+
  theme_classic() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 30),
    #plot.title=element_text(size=30),
    axis.text.y = element_text(size = 40),
    axis.title.y = element_text(size = 40),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks = element_line(size = 4),
    legend.position = "none"
  ) +
  scale_colour_manual(values = c("black")) +
  scale_y_continuous(limits = c(0, 3.2)) +
  geom_hline(
    yintercept = (avg_rate_df %>%
      filter(class == "mappable") %>%
      filter(TP53 == "WT"))$mut_rate * 1000000,
    linetype = "dashed"
  )

p_rep_std_v2

ggsave(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/rep_std.v2.pdf",
  p_rep_std_v2,
  height = 10,
  width = 8
)


# Show the version without the reference line again.
p_rep_std


# Print the pooled, de-duplicated WT leading/lagging counts (one distinct row).
merge_df %>%
  group_by(TP53, rep_std) %>%
  select(-id) %>%
  unique() %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = ifelse(
      TP53 == "WT",
      leading / ref_len_df$leading / 10,
      leading / ref_len_df$leading / 6
    ),
    lagging_rate = ifelse(
      TP53 == "WT",
      lagging / ref_len_df$lagging / 10,
      lagging / ref_len_df$lagging / 6
    ),
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  ) %>%
  filter(TP53 == "WT") %>%
  gather(region, rate, leading_rate:lagging_rate) %>%
  select(leading, lagging) %>%
  unique()

# Chi-square goodness-of-fit test: observed leading/lagging mutation counts
# against the proportions expected from the reference lengths.
#c(1483,3438)
ref_len_df

# Observed counts in (leading, lagging) order.
# NOTE(review): hard-coded; presumably the WT counts printed by the preceding
# pipeline - confirm they are still current.
observed_counts <- c(leading = 1483, lagging = 3438)

# Expected proportions are taken from ref_len_df by column name so that their
# order always matches observed_counts. ref_len_df stores its columns as
# (lagging, leading), so proportions copied from prop.table() output are in
# the opposite order to the counts.
expected_prop <- c(
  leading = ref_len_df$leading,
  lagging = ref_len_df$lagging
) / (ref_len_df$leading + ref_len_df$lagging)

# Console views: the rstatix summary of the test and the reference proportions.
rstatix::chisq_test(unname(observed_counts), p = unname(expected_prop))
prop.table(ref_len_df[1, ])

# The p-value is what ends up in the output file. Only one write is made: an
# earlier write of the rstatix table to this same path was overwritten by the
# p-value straight away.
rep_std_chisq_pval <- chisq.test(
  unname(observed_counts),
  p = unname(expected_prop)
)$p.value
rep_std_chisq_pval

write.table(
  rep_std_chisq_pval,
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/data/rep_std/rep_std.chisq.pval.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)

# From here on TP53 is a factor ordered WT, KO.
merge_df$TP53 <- factor(merge_df$TP53, levels = c("WT", "KO"))

# Bar plot for WT: mean per-sample leading/lagging ratio with a two-sided 95%
# t-based confidence interval, saved as a PDF.
p2 <- merge_df %>%
  group_by(id, TP53, rep_std) %>%
  filter(TP53 == "WT") %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading,
    lagging_rate = lagging / ref_len_df$lagging,
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  ) %>%
  select(id, TP53, leading_ratio, lagging_ratio) %>%
  gather(class, ratio, leading_ratio:lagging_ratio) %>%
  group_by(TP53, class) %>%
  summarise(
    mean.ratio = mean(ratio, na.rm = TRUE),
    sd.ratio = sd(ratio, na.rm = TRUE),
    n.ratio = n()
  ) %>%
  mutate(
    se.ratio = sd.ratio / sqrt(n.ratio),
    lower.ci.ratio = mean.ratio - qt(1 - (0.05 / 2), n.ratio - 1) * se.ratio,
    upper.ci.ratio = mean.ratio + qt(1 - (0.05 / 2), n.ratio - 1) * se.ratio
  ) %>%
  ggplot(aes(x = class, y = mean.ratio, col = "black", fill = class)) +
  geom_bar(stat = "identity") +
  geom_errorbar(
    aes(y = mean.ratio, ymax = upper.ci.ratio, ymin = lower.ci.ratio),
    width = .5
  ) +
  #  facet_wrap(~TP53)+
  theme_classic() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(size = 30),
    #plot.title=element_text(size=30),
    axis.text.y = element_text(size = 40),
    axis.title.y = element_text(size = 40),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks = element_line(size = 4),
    legend.position = "none"
  ) +
  scale_colour_manual(values = c("black")) +
  scale_y_continuous(limits = c(0, 1))

p2

ggsave(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/240222/rep_std.ratio.pdf",
  p2,
  height = 10,
  width = 8
)

# Per sample, TP53 status and mutation type: leading/lagging counts,
# length-normalised rates and ratios, printed up to 100 rows.
# A stray bare symbol `merge_f` used to precede this pipeline; no object of
# that name is created in the visible script, so evaluating it stopped the run
# with "object 'merge_f' not found". It has been dropped.
merge_df %>%
  group_by(id, TP53, rep_std, mut_type) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading,
    lagging_rate = lagging / ref_len_df$lagging,
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  ) %>%
  print(n = 100)



# Per TP53 status and mutation type: pooled leading/lagging counts,
# length-normalised rates and ratios (printed only, no per-sample divisor).
merge_df %>%
  group_by(TP53, rep_std, mut_type) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading,
    lagging_rate = lagging / ref_len_df$lagging,
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  )



# Per mutation type, pooled over all samples: leading/lagging counts,
# length-normalised rates and ratios (printed only).
merge_df %>%
  group_by(rep_std, mut_type) %>%
  dplyr::summarise(n = n()) %>%
  spread(rep_std, n) %>%
  mutate(
    leading_rate = leading / ref_len_df$leading,
    lagging_rate = lagging / ref_len_df$lagging,
    leading_ratio = leading_rate / (leading_rate + lagging_rate),
    lagging_ratio = lagging_rate / (leading_rate + lagging_rate)
  )

# filter() without conditions keeps every row, so this prints merge_df as is.
merge_df %>%
  filter()

