library(stringr)
library(dplyr)
library(tidyverse)

# ---- Input tables ----

# Sample metadata table (tab-separated).
metadata <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/sig/metadata.txt"
)

# Annotated clustered-mutation table covering all samples.
cl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/v2/APOBEC_clustered_mutations/annotated/vaf_considered/APOBEC_clustered_mutations.all_samples.annotated.tsv"
)

# Show the distinct (samples, imd_cutoff) pairs ordered by cutoff.
cl_df %>%
  dplyr::select(samples, imd_cutoff) %>%
  unique() %>%
  arrange(imd_cutoff)

# Full paths of the files in the new_filter directory matching ".excl.*gd$".
files_to_read <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter",
  pattern = ".excl.*gd$",
  full.names = TRUE
)
#files_to_read


# ---- Merged mutation table ----
# Stack the per-sample tables, keep rows whose sig_cont looks like "NN>NN",
# attach sample metadata (without the `m/d` column) and the cluster
# annotation from cl_df, then fill in defaults for unclustered mutations.
# NOTE(review): `vcf_tmp` is not created anywhere in the visible part of this
# script (files_to_read is listed but never read) -- confirm where it comes from.
merge_df <- do.call(rbind, vcf_tmp) %>%
  filter(grepl("[ATGC][ATGC]>[ATGC][ATGC]", sig_cont)) %>%
  left_join(
    metadata %>% select(-`m/d`)
  ) %>%
  left_join(
    cl_df %>%
      select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3) %>%
      plyr::rename(c("chr" = "#CHROM", "start" = "POS", "samples" = "id"))
  ) %>%
  mutate(
    # Mutations without a cluster annotation get IMD 0, cluster_id "." and
    # the cluster label "non-clust".
    IMD = ifelse(is.na(IMD), 0, IMD),
    cluster_id = ifelse(is.na(cluster_id), ".", cluster_id),
    cluster = ifelse(
      is.na(cluster_type_omikli_upto_3),
      "non-clust",
      cluster_type_omikli_upto_3
    )
  ) %>%
  select(-cluster_type_omikli_upto_3)

# ---- A3A samples: keep clusters with at least two TC>[GT][AT] mutations ----

# Rows with dose "3ug" or "100ng" and APOBEC == "A3A".
A3A_cl_merge_df <- merge_df %>%
  filter(dose %in% c("3ug", "100ng"), APOBEC == "A3A") %>%
  mutate(APOBEC = "A3A")

# AMS = number of mutations per cluster matching "TC>[GT][AT]". The summary
# drops "non-clust" rows, so after the join those rows have AMS = NA and are
# removed by the AMS >= 2 filter together with clusters below the threshold.
A3A_cl_merge_f_df <- A3A_cl_merge_df %>%
  left_join(
    A3A_cl_merge_df %>%
      group_by(id, `#CHROM`, APOBEC, dose, TP53, cluster_id, cluster) %>%
      dplyr::summarise(AMS = sum(grepl("TC>[GT][AT]", sig_cont))) %>%
      filter(cluster != "non-clust")
  ) %>%
  filter(AMS >= 2)

# no_snv = number of retained rows per (id, chromosome, cluster_id, cluster).
A3A_cl_merge_f_df <- A3A_cl_merge_f_df %>%
  left_join(
    A3A_cl_merge_f_df %>%
      group_by(id, `#CHROM`, cluster_id, cluster) %>%
      dplyr::summarise(no_snv = n())
  )

# ---- Remove clusters listed in the two exclusion tables ----

excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/exclude_cl_mut_df.v2.txt"
)

# The second exclusion table gets the same "<id>_<chrom>_<cluster_id>" key
# that is built for the mutation table below.
miss_phs_excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/miss_phased_cluster.txt"
)
miss_phs_excl_df <- mutate(
  miss_phs_excl_df,
  info = paste(id, `#CHROM`, cluster_id, sep = "_")
)

A3A_cl_merge_f_df <- mutate(
  A3A_cl_merge_f_df,
  info = paste(id, `#CHROM`, cluster_id, sep = "_")
)

# Keep only rows whose key appears in neither exclusion table.
A3A_cl_merge_f_excl_df <- A3A_cl_merge_f_df %>%
  filter(
    !info %in% excl_df$info,
    !info %in% miss_phs_excl_df$info
  )



# ---- Per-cluster counts of TC>, [AGC]C> and non-C> mutations ----
# For every "kataegis" cluster (keyed by info) count the de-duplicated rows
# whose sig_cont matches "TC>", "[AGC]C>" or does not contain "C>".
cont_sum_df <- A3A_cl_merge_f_excl_df %>%
  filter(cluster == "kataegis") %>%
  unique() %>%
  mutate(
    TCN = ifelse(grepl("TC>", sig_cont), 1, 0),
    VCN = ifelse(grepl("[AGC]C>", sig_cont), 1, 0),
    nonC = ifelse(!grepl("C>", sig_cont), 1, 0)
  ) %>%
  group_by(info) %>%
  dplyr::summarise(
    tot_TCN = sum(TCN),
    tot_VCN = sum(VCN),
    tot_nonC = sum(nonC)
  )

# VCN_df is the same summary under a second name.
VCN_df <- cont_sum_df



## Cluster keys to exclude
excl_info <- c(
  "A3A_1st_C3_3ug-2_10_10",
  "A3A_C3_TP53_C3_100ng-3_10_2",
  "A3A_C3_TP53_C3_3ug-2_4_8"
)

# IMD cutoff table, with its Sample column renamed to id.
cut_df <- plyr::rename(
  read_tsv(
    "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/v2/imds/APOBEC_clustered_mutations_imd_cutoff.tsv"
  ),
  c("Sample" = "id")
)

# First and last mutation position of every "kataegis" cluster.
range_df <- A3A_cl_merge_f_excl_df %>%
  filter(cluster == "kataegis") %>%
  unique() %>%
  group_by(id, `#CHROM`, cluster_id, info) %>%
  dplyr::summarise(
    min_pos = min(POS),
    max_pos = max(POS)
  )

## edit range_df


# ---- Reference sequence of each cluster span and the 64 trinucleotide patterns ----
library(Rsamtools)
# Handle to the reference FASTA file.
fasta_file<-FaFile(file='/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa')
# One "+"-strand range per row of range_df, widened by 1 bp on each side
# (presumably so the flanking bases of the first/last mutation are part of
# the extracted sequence -- TODO confirm).
gene_gr<-GRanges(seqnames=range_df$`#CHROM`,IRanges(start=(as.numeric(range_df$min_pos)-1), end=(as.numeric(range_df$max_pos)+1)),strand="+")
# NOTE(review): reduce() merges overlapping ranges, so gene_seq may hold fewer
# sequences than range_df has rows (and possibly in a different order). Later
# code labels the sequences with rep(range_df$info, each=64), which assumes a
# one-to-one, same-order correspondence -- confirm no cluster spans overlap.
gene_seq<-getSeq(fasta_file,reduce(gene_gr))

# The 32 trinucleotides whose middle base is C or T (5' base x middle x 3' base).
pos_cont<-paste0(rep(rep(c("A","C","G","T"),each=4),2),rep(c("C","T"),each=16),rep(c("A","C","G","T"),8))
library(stringr)
library(stringi)
# Reverse complement of every entry of pos_cont.
rev_cont<-stri_reverse(chartr("ACGT","TGCA",pos_cont))

# All 64 patterns: the 32 C/T-centred ones followed by their reverse complements.
cont_c<-c(pos_cont,rev_cont)

# ---- Trinucleotide occurrence counts: test string, kataegis spans, genome ----

# Count how often one trinucleotide `context` occurs in each element of `seqs`.
#
# Returns a tibble with one row per element of `seqs` and two columns:
# `value` (the count) and `sig_cont` (the context that was counted).
#
# The context is wrapped in a zero-width lookahead because str_count() with a
# plain pattern only counts non-overlapping matches, which undercounts
# self-overlapping trinucleotides (e.g. "TCT" occurs twice in "TCTCT" but a
# plain count reports 1). The same helper is used for the kataegis spans and
# for the whole genome so that both counts follow the same rule.
count_context <- function(seqs, context) {
  tibble(
    value = str_count(seqs, paste0("(?=", context, ")")),
    sig_cont = context
  )
}

# Small hand-checkable sequence used to eyeball the counting.
t_seq_df <- c("TCATGTAC")

t_la <- lapply(t_seq_df, function(x) {
  bind_rows(lapply(cont_c, function(i) count_context(x, i)))
})

# Counts of all 64 patterns in every extracted cluster sequence
# (64 consecutive rows per sequence, in cont_c order).
kat_cont_count_tmp <- lapply(as.data.frame(gene_seq)$x, function(x) {
  bind_rows(lapply(cont_c, function(i) count_context(x, i)))
})

kat_cont_count_df <- bind_rows(kat_cont_count_tmp)
kat_cont_count_df$sig_cont <- factor(kat_cont_count_df$sig_cont, levels = cont_c)

# NOTE(review): this labelling assumes gene_seq holds exactly one sequence per
# row of range_df, in the same order; gene_seq was built from reduce(gene_gr),
# which merges overlapping ranges -- confirm the lengths match.
kat_cont_count_df <- kat_cont_count_df %>%
  mutate(info = rep(range_df$info, each = 64))
write.table(
  kat_cont_count_df,
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/kat_cont_count_df.v2.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

kat_cont_count_df <- read_tsv("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/kat_cont_count_df.v2.txt")
# Total occurrences of each of the 64 patterns over all cluster sequences.
# NOTE(review): unlike the genome-wide sum computed later in this script,
# reverse-complement patterns are NOT folded into their C/T-centred form here;
# confirm that this asymmetry is intended.
kat_cont_count_sum_df <- kat_cont_count_df %>%
  group_by(sig_cont) %>%
  dplyr::summarise(n = sum(value))

library(BSgenome.Hsapiens.Ensembl.GRCh37.jolim)
# Sequences of chromosomes 1-22 and X. The variable name `c` shadows base::c
# as a value (calls like c(...) still resolve to the function); the name is
# kept so that any code relying on it keeps working.
c <- getSeq(BSgenome.Hsapiens.Ensembl.GRCh37.jolim, c(as.character(c(1:22)), "X"))
# One tibble per pattern, holding one count per chromosome.
ref_cont_count_tmp <- lapply(cont_c, function(i) count_context(c, i))
ref_cont_count_df <- bind_rows(ref_cont_count_tmp)
write.table(
  ref_cont_count_df,
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/ref_cont_count_df.v2.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
ref_cont_count_df <- read_tsv("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/ref_cont_count_df.v2.txt")

# The 96 mutation labels "<5' base><ref>><alt><3' base>", ordered
# C>A, C>G, C>T, T>A, T>C, T>G and, inside each substitution, by 5' base and
# then 3' base. Every component is built at full length 96.
contextorder96 <- local({
  nt <- c("A", "C", "G", "T")
  five_prime <- rep(rep(nt, each = 4), times = 6)
  ref_base <- rep(c("C", "T"), each = 48)
  alt_base <- rep(c("A", "G", "T", "A", "C", "G"), each = 16)
  three_prime <- rep(nt, times = 24)
  paste0(five_prime, ref_base, ">", alt_base, three_prime)
})

# ---- Kataegis mutation counts per 96-class label ----

# Lookup from each 96-class label to its trinucleotide (5' base, ref, 3' base).
# Built directly as a tibble: the previous data.frame() %>% as.tibble() route
# used a deprecated function and yields a factor column on R < 4.0.
ref_32_df <- tibble(mut_sig_cont = contextorder96) %>%
  mutate(sig_cont = paste0(substr(mut_sig_cont, 1, 2), substr(mut_sig_cont, 5, 5)))

# Number of de-duplicated "kataegis" rows per label. The original sig_cont
# column is renamed to mut_sig_cont and sig_cont is rebuilt as the
# trinucleotide so both join keys exist.
kat_mut_count_df <-
  A3A_cl_merge_f_excl_df %>%
  filter(cluster == "kataegis") %>%
  unique() %>%
  plyr::rename(c("sig_cont" = "mut_sig_cont")) %>%
  mutate(sig_cont = paste0(substr(mut_sig_cont, 1, 2), substr(mut_sig_cont, 5, 5))) %>%
  group_by(sig_cont, mut_sig_cont) %>%
  dplyr::summarise(kat_mut_count = n()) %>%
  # Drop the leftover grouping so it cannot leak into later steps.
  ungroup()

# Labels with no observed mutation come back as NA from the join; set them to 0.
kat_mut_count_ref_df <- left_join(ref_32_df, kat_mut_count_df)
kat_mut_count_ref_df[is.na(kat_mut_count_ref_df)] <- 0

# Attach the trinucleotide occurrence count inside the cluster spans.
kat_mut_count_ref_df <- left_join(
  kat_mut_count_ref_df,
  kat_cont_count_sum_df %>% plyr::rename(c("n" = "kat_ref_cont_count"))
)



# ---- Genome-wide context totals and per-context mutation rates ----

# Fold every pattern whose middle base is not C/T onto its reverse complement,
# then total the counts per C/T-centred trinucleotide.
ref_cont_count_sum_df <- ref_cont_count_df %>%
  mutate(
    sig_cont = ifelse(
      substr(sig_cont, 2, 2) %in% c("C", "T"),
      sig_cont,
      stri_reverse(chartr("ACGT", "TGCA", sig_cont))
    )
  ) %>%
  group_by(sig_cont) %>%
  dplyr::summarise(tot_ref_cont_count = sum(value))

tot_mut_count_ref_df <- left_join(kat_mut_count_ref_df, ref_cont_count_sum_df)
#tot_mut_count_ref_df%>%filter(sig_cont=="ACA")

# Add, per label, the mutation count over all samples that appear in
# A3A_cl_merge_f_df, and derive the "outside kataegis" counts and both rates.
# NOTE(review): `id_count` is not defined in the visible part of this script;
# it must already exist when this runs -- confirm its origin and meaning.
tot_mut_count_ref_count_df <- tot_mut_count_ref_df %>%
  mutate(kat_out_ref_cont_count = tot_ref_cont_count - kat_ref_cont_count) %>%
  left_join(
    merge_df %>%
      filter(id %in% unique(A3A_cl_merge_f_df$id)) %>%
      group_by(sig_cont) %>%
      dplyr::summarise(tot_mut_count = n()) %>%
      plyr::rename(c("sig_cont" = "mut_sig_cont"))
  ) %>%
  mutate(
    kat_out_mut_count = tot_mut_count - kat_mut_count,
    kat_rate = kat_mut_count / kat_ref_cont_count,
    tot_rate = kat_out_mut_count / (kat_out_ref_cont_count * id_count)
  )
# ---- Export the summary table and two count tables ----

# Save the full per-label summary and read it back from disk.
write.table(
  tot_mut_count_ref_count_df,
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/other_kat_sum_table.v2.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)
tot_mut_count_ref_count_df <- read_tsv("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/other_kat_sum_table.v2.txt")

# Three-column table (Substitution, Trinucleotide, Count) of kataegis counts.
tot_mut_count_ref_count_df %>%
  transmute(
    Substitution = substr(mut_sig_cont, 2, 4),
    Trinucleotide = sig_cont,
    Count = kat_mut_count
  ) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/kataegis_count_count.v2.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# Same table, but with Count forced to 0 for trinucleotides starting with "TC".
tot_mut_count_ref_count_df %>%
  transmute(
    Substitution = substr(mut_sig_cont, 2, 4),
    Trinucleotide = sig_cont,
    Count = kat_mut_count
  ) %>%
  mutate(Count = ifelse(grepl("^TC", Trinucleotide), 0, Count)) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/kataegis_count_count.vcn_edit.v2.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )
  

# ---- Long-format rate tables used by the rate plots ----

# Stack kat_rate and tot_rate into (region, rate); gather() places all
# kat_rate rows first, then all tot_rate rows. Labels become a factor in
# contextorder96 order.
tot_mut_count_ref_count_sum_df <- tot_mut_count_ref_count_df %>%
  select(mut_sig_cont, kat_rate, tot_rate) %>%
  gather(region, rate, kat_rate:tot_rate) %>%
  mutate(mut_sig_cont = factor(mut_sig_cont, levels = contextorder96))

# Rates scaled by 1e6 and stored as `count`; type_3 is the x position 1..96,
# repeated once per region (relies on the stacking order noted above).
other_kat_input_df <- tot_mut_count_ref_count_sum_df %>%
  mutate(
    rate = rate * 1000000,
    Substitution = substr(mut_sig_cont, 2, 4),
    Trinucleotide = paste0(substr(mut_sig_cont, 1, 2), substr(mut_sig_cont, 5, 5))
  ) %>%
  plyr::rename(c("rate" = "count")) %>%
  mutate(type_3 = rep(c(1:96), 2))

library(cowplot)

# Six substitution colours and two region colours.
palette.COSMIC.SNV.96 <- c(
  "#1EBFF0", "#050708", "#E62725",
  "#CBCACB", "#A1CF64", "#EDC8C5"
)
col_c6 <- c("#D97A36", "#594C43")

# Variant for the capped plot: every non-zero kat_rate value is replaced by
# 4.2 (zeros stay 0); tot_rate values are untouched. region becomes a factor
# with tot_rate first.
other_kat_mag_input_df <- other_kat_input_df %>%
  mutate(
    count = ifelse(region == "kat_rate", ifelse(count != 0, 4.2, 0), count),
    region = factor(region, levels = c("tot_rate", "kat_rate"))
  )


# ---- Plot: kat_out_mut_count per 96-class label ----
# h1: one bar per label (x = type_3, 1..96) with height kat_out_mut_count,
# filled by Substitution. The plotting data adds Substitution and type_3 to
# tot_mut_count_ref_count_df and renames sig_cont to Trinucleotides.
h1<-ggplot(tot_mut_count_ref_count_df%>%
             mutate(Substitution=substr(mut_sig_cont,2,4))%>%
             plyr::rename(c("sig_cont"="Trinucleotides"))%>%
             mutate(type_3=c(1:96))
)+
  #geom_hline(yintercept=seq(0,150,by=50),
  #           linetype="dotted",col="grey")+
  # Faint background rectangle behind every bar, from y = 0 to y = 4.2.
  geom_rect(aes(xmin = as.numeric(type_3) - 0.5, xmax = as.numeric(type_3) + 0.5, ymin = 0, ymax = 4.2,  fill = Substitution), alpha = 0.06)+
  geom_bar(mapping=aes(x=type_3,y=kat_out_mut_count,fill=Substitution),
           stat="identity",
           position="dodge",
           width=0.5)+
  #  guides(fill = guide_legend(ncol = 2))+
  theme(#axis.text.x.bottom = element_blank(),
    #axis.ticks.x = element_blank(),
    axis.ticks.y=element_line(size=1,colour="grey"),
    axis.ticks.length.y=unit(-0.25,"cm"),
    panel.spacing.x = unit(0, "mm"),
    axis.title.x = element_blank(),
    strip.background.x = element_blank(),
    strip.text.x = element_blank())+
  #facet_grid(.~Substitution, scales = "free_x")+
  theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1,size=20,family="Consolas"),
        axis.text.y=element_text(size=55),
        axis.title.y=element_text(size=60)
  )+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = "grey", fill=NA, size=1),
  )+
  # x axis: one tick per label, labelled with contextorder96.
  scale_x_continuous(
    breaks = c(1:96),
    labels = contextorder96,
    expand = c(0.01, 0)
  )+
  # y axis is left at its defaults (all options below are commented out).
  scale_y_continuous(
    #limits=c(0,4.2),
    #expand=c(0,0),
    #breaks=seq(0,4,by=1),
    #labels=seq(0,4,by=1)
  )+
  xlab("")+ylab("")+
  #scale_fill_manual(values=col_c6)+
  theme(legend.position = "none")+
  scale_fill_manual(values=c(palette.COSMIC.SNV.96[1:3],col_c6[1],palette.COSMIC.SNV.96[4:6],col_c6[2]))+
  scale_colour_manual(values=c(col_c6))+
  
  # This later ylab() replaces the empty label set above.
  ylab("# of SNVs")

#  scale_colour_manual(values=c(palette.COSMIC.SNV.96,col_c6))

h1
# h2: header strip -- one unit-height tile per label, filled by Substitution
# and faceted by Substitution, drawn with theme_void().
h2<-ggplot(other_kat_mag_input_df)+
  geom_bar(mapping = aes(x = type_3, y = 1, fill = Substitution),
           stat = "identity",
           width = 1)+
  theme_void()+
  theme(panel.spacing.x = unit(0, "mm"),)+
  facet_grid(.~Substitution, scales = "free_x")+
  scale_fill_manual(values=palette.COSMIC.SNV.96)+
  theme(legend.position = "none")+
  theme(strip.text.x=element_text(size=40))

h2
h1
# NOTE(review): both plots already set legend.position = "none" before
# get_legend() is called, so `legend` may not contain a usable legend -- confirm.
legend <- plot_grid(get_legend(h2), get_legend(h1), ncol = 1)
h1 <- h1 + theme(legend.position = "none")
h2 <- h2 + theme(legend.position = "none")

# Stack the header strip above the bar plot (height ratio 0.5 : 5); the
# combined figure is only displayed here, not saved.
plot_grid(h2, h1, align = "v", ncol = 1, axis = "tblr", rel_heights = c(0.5, 5))



# ---- Plot: capped rate plot (other_kat_rate_mag_96.pdf) ----
# h1: dodged bars of `count` per label for the two regions, using
# other_kat_mag_input_df (where non-zero kat_rate values were set to 4.2).
h1<-ggplot(other_kat_mag_input_df)+
  #geom_hline(yintercept=seq(0,150,by=50),
  #           linetype="dotted",col="grey")+
  # Faint background rectangle behind every label, spanning the full y range.
  geom_rect(aes(xmin = as.numeric(type_3) - 0.5, xmax = as.numeric(type_3) + 0.5, ymin = 0, ymax = 4.2,  fill = Substitution), alpha = 0.06)+
  geom_bar(mapping=aes(x=type_3,y=count,fill=region),
           stat="identity",
           position="dodge",
           width=0.5)+
  #  guides(fill = guide_legend(ncol = 2))+
  theme(#axis.text.x.bottom = element_blank(),
    #axis.ticks.x = element_blank(),
    axis.ticks.y=element_line(size=1,colour="grey"),
    axis.ticks.length.y=unit(-0.25,"cm"),
    panel.spacing.x = unit(0, "mm"),
    axis.title.x = element_blank(),
    strip.background.x = element_blank(),
    strip.text.x = element_blank())+
  #facet_grid(.~Substitution, scales = "free_x")+
  theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1,size=20,family="Consolas"),
        axis.text.y=element_text(size=55),
        axis.title.y=element_text(size=60)
  )+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = "grey", fill=NA, size=1),
  )+
  scale_x_continuous(
    breaks = c(1:96),
    labels = contextorder96,
    expand = c(0.01, 0)
  )+
  # Linear y axis fixed to 0..4.2 with ticks at 0..4.
  scale_y_continuous(
    limits=c(0,4.2),
    expand=c(0,0),
    breaks=seq(0,4,by=1),
    labels=seq(0,4,by=1)
  )+
  xlab("")+ylab("")+
  #scale_fill_manual(values=col_c6)+
  theme(legend.position = "none")+
  # The fill scale is shared by the Substitution rectangles and the region
  # bars, hence eight colours in one manual scale.
  scale_fill_manual(values=c(palette.COSMIC.SNV.96[1:3],col_c6[1],palette.COSMIC.SNV.96[4:6],col_c6[2]))+
  scale_colour_manual(values=c(col_c6))+
  
  ylab("# of SNVs")

#  scale_colour_manual(values=c(palette.COSMIC.SNV.96,col_c6))

h1
# h2: header strip of unit-height tiles filled and faceted by Substitution.
h2<-ggplot(other_kat_mag_input_df)+
  geom_bar(mapping = aes(x = type_3, y = 1, fill = Substitution),
           stat = "identity",
           width = 1)+
  theme_void()+
  theme(panel.spacing.x = unit(0, "mm"),)+
  facet_grid(.~Substitution, scales = "free_x")+
  scale_fill_manual(values=palette.COSMIC.SNV.96)+
  theme(legend.position = "none")+
  theme(strip.text.x=element_text(size=40))

h2

legend <- plot_grid(get_legend(h2), get_legend(h1), ncol = 1)
h1 <- h1 + theme(legend.position = "none")
h2 <- h2 + theme(legend.position = "none")

# Header strip stacked above the bar plot, then written to PDF via cairo_pdf.
other_kat_rate_mag_plot<-plot_grid(h2, h1, align = "v", ncol = 1, axis = "tblr", rel_heights = c(0.5, 5))
other_kat_rate_mag_plot

save_plot("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/other_kat_rate_mag_96.pdf",other_kat_rate_mag_plot,
          ncol=1,
          nrow=2,
          base_asp=7,
          #unit="px",
          
          device=cairo_pdf)
# NOTE(review): the region factor for other_kat_input_df is built from
# other_kat_mag_input_df$region. The two tables share row order (the latter was
# derived from the former by mutate), so the values agree, but this looks like
# a copy-paste -- confirm.
other_kat_input_df$region<-factor(other_kat_mag_input_df$region,levels=c("tot_rate","kat_rate"))

# Print the largest scaled rate (presumably to choose the y-axis limit of the
# uncapped rate plot -- TODO confirm).
other_kat_input_df$count%>%max()

# ---- TC> vs other C> totals, and the observed/expected long table ----

# Print column totals for C> labels, split by whether the label contains "TC>".
tot_mut_count_ref_count_df %>%
  filter(grepl("C>", mut_sig_cont)) %>%
  mutate(group = ifelse(grepl("TC>", mut_sig_cont), "TCN", "nonTCN")) %>%
  group_by(group) %>%
  dplyr::summarise(
    tot_ref_cont_count_sum = sum(tot_ref_cont_count),
    kat_out_ref_cont_count_sum = sum(kat_out_ref_cont_count),
    tot_mut_count_sum = sum(tot_mut_count),
    kat_out_mut_count_sum = sum(kat_out_mut_count),
    kat_mut_count_sum = sum(kat_mut_count),
    kat_Ref_cont_count_sm = sum(kat_ref_cont_count)
  )

# kat_mut_exp_count = kat_out_mut_count * kat_ref_cont_count /
# kat_out_ref_cont_count. Observed (kat_mut_count) and this derived count are
# stacked into (region, count); gather() places all kat_mut_count rows first,
# which the type_3 = rep(1:96, 2) x positions rely on.
other_kat_input_df_v2 <- tot_mut_count_ref_count_df %>%
  mutate(
    kat_mut_exp_count = kat_out_mut_count * kat_ref_cont_count / kat_out_ref_cont_count
  ) %>%
  select(mut_sig_cont, kat_mut_count, kat_mut_exp_count) %>%
  gather(region, count, kat_mut_count, kat_mut_exp_count) %>%
  mutate(
    type_3 = rep(c(1:96), 2),
    Substitution = substr(mut_sig_cont, 2, 4),
    Trinucleotide = paste0(substr(mut_sig_cont, 1, 2), substr(mut_sig_cont, 5, 5)),
    region = factor(region, levels = c("kat_mut_exp_count", "kat_mut_count"))
  )

# ---- Plot: kat_mut_count vs kat_mut_exp_count (other_kat_rate_96.v3.pdf) ----
# h1: dodged bars of `count` per label for the two regions of
# other_kat_input_df_v2, on a pseudo-log y axis from 0 to 200.
h1<-ggplot(other_kat_input_df_v2)+
  #geom_hline(yintercept=seq(0,150,by=50),
  #           linetype="dotted",col="grey")+
  # Faint background rectangle behind every label, spanning the full y range.
  geom_rect(aes(xmin = as.numeric(type_3) - 0.5, xmax = as.numeric(type_3) + 0.5, ymin = 0, ymax = 200,  fill = Substitution), alpha = 0.06)+
  geom_bar(mapping=aes(x=type_3,y=count,fill=region),
           stat="identity",
           position="dodge",
           width=0.5)+
  #  guides(fill = guide_legend(ncol = 2))+
  theme(#axis.text.x.bottom = element_blank(),
    #axis.ticks.x = element_blank(),
    axis.ticks.y=element_line(size=1,colour="grey"),
    axis.ticks.length.y=unit(-0.25,"cm"),
    panel.spacing.x = unit(0, "mm"),
    axis.title.x = element_blank(),
    strip.background.x = element_blank(),
    strip.text.x = element_blank())+
  #facet_grid(.~Substitution, scales = "free_x")+
  theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1,size=20,family="Consolas"),
        axis.text.y=element_text(size=20),
        axis.title.y=element_text(size=20)
  )+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = "grey", fill=NA, size=1),
  )+
  scale_x_continuous(
    breaks = c(1:96),
    labels = contextorder96,
    expand = c(0.01, 0)
  )+
  #scale_y_log10()+
  # Pseudo-log transform (base 10, sigma 0.01) so that zero counts can be drawn.
  scale_y_continuous(
    limits=c(0,200),
    expand=c(0,0),
    trans=scales::pseudo_log_trans(base = 10,sigma = 0.01),
    breaks=c(0,0.01,0.1,1,10,100,200),
    labels=c(0,0.01,0.1,1,10,100,200)
  )+
  xlab("")+ylab("")+
  #scale_fill_manual(values=col_c6)+
  theme(legend.position = "none")+
  # NOTE(review): the colour order here (two region colours after the first
  # three substitution colours) differs from the other plots in this script,
  # which interleave col_c6[1] and col_c6[2] separately -- confirm it matches
  # the order of the fill levels in this data.
  scale_fill_manual(values=c(palette.COSMIC.SNV.96[1:3],col_c6[1:2],palette.COSMIC.SNV.96[4:6]))+
  scale_colour_manual(values=c(col_c6))+
  
  ylab("# of SNVs")
h1
# h2: header strip of unit-height tiles filled and faceted by Substitution.
h2<-ggplot(other_kat_input_df_v2)+
  geom_bar(mapping = aes(x = type_3, y = 1, fill = Substitution),
           stat = "identity",
           width = 1)+
  theme_void()+
  theme(panel.spacing.x = unit(0, "mm"),)+
  facet_grid(.~Substitution, scales = "free_x")+
  scale_fill_manual(values=palette.COSMIC.SNV.96)+
  theme(legend.position = "none")+
  theme(strip.text.x=element_text(size=40))

h2

legend <- plot_grid(get_legend(h2), get_legend(h1), ncol = 1)
h1 <- h1 + theme(legend.position = "none")
h2 <- h2 + theme(legend.position = "none")

# Header strip stacked above the bar plot, then written to PDF via cairo_pdf.
other_kat_rate_plot_v2<-plot_grid(h2, h1, align = "v", ncol = 1, axis = "tblr", rel_heights = c(0.5, 5))
other_kat_rate_plot_v2

save_plot("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/other_kat_rate_96.v3.pdf",other_kat_rate_plot_v2,
          ncol=1,
          nrow=2,
          base_asp=7,
          #unit="px",
          
          device=cairo_pdf)


# ---- Observed vs derived-expected summaries by context class ----
# Labels are classed by their trinucleotide: "TCN" when it starts with "TC",
# otherwise "nonTCN" (one pipeline below additionally separates "NTN").

# Print, per C> substitution and context class, the summed counts of both
# regions and their ratio kat_mut_count / kat_mut_exp_count.
other_kat_input_df_v2%>%
  filter(grepl("C>",Substitution))%>%
  mutate(cont_type=ifelse(grepl("^TC",Trinucleotide),"TCN","nonTCN"))%>%
  group_by(region,Substitution,cont_type)%>%
  dplyr::summarise(count_sum=sum(count))%>%
  spread(region,count_sum)%>%
  mutate(count_rate=kat_mut_count/kat_mut_exp_count)

# Same ratio per context class over all substitutions; trinucleotides with a
# T in the middle position are labelled "NTN" here.
other_kat_input_df_v2_cont_rate<-
other_kat_input_df_v2%>%
  #filter(grepl("C>",Substitution))%>%
  mutate(cont_type=ifelse(grepl("^TC",Trinucleotide),"TCN",
                          ifelse(grepl(".T.",Trinucleotide),"NTN","nonTCN")))%>%
  group_by(region,cont_type)%>%
  dplyr::summarise(count_sum=sum(count))%>%
  spread(region,count_sum)%>%
  mutate(count_rate=kat_mut_count/kat_mut_exp_count)

other_kat_input_df_v2_cont_rate$cont_type<-factor(other_kat_input_df_v2_cont_rate$cont_type,levels=c("TCN","nonTCN","NTN"))
# Bar plot of count_rate for TCN and nonTCN only; y axis fixed to 0..600.
# col="black" inside aes() maps a constant to the outline colour; the
# resulting legend is hidden.
p_cont_rate<-other_kat_input_df_v2_cont_rate%>%
  filter(cont_type%in%c("TCN","nonTCN"))%>%
  ggplot(aes(x=cont_type,y=count_rate,col="black"))+
  geom_bar(stat="identity")+
  theme_classic()+
  theme(legend.position="none")+
  ylim(0,600)
p_cont_rate
ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/other_kat.cont_rate.pdf",p_cont_rate,
         width=8,height=10)


# Repeat of the first summary in this section (printed only).
other_kat_input_df_v2%>%
  filter(grepl("C>",Substitution))%>%
  mutate(cont_type=ifelse(grepl("^TC",Trinucleotide),"TCN","nonTCN"))%>%
  group_by(region,Substitution,cont_type)%>%
  dplyr::summarise(count_sum=sum(count))%>%
  spread(region,count_sum)%>%
  mutate(count_rate=kat_mut_count/kat_mut_exp_count)


# Bar plot of the TCN / nonTCN count ratio for each region (C> labels only);
# y axis fixed to 0..15.
p_other_rate<-other_kat_input_df_v2%>%
  filter(grepl("C>",Substitution))%>%
  mutate(cont_type=ifelse(grepl("^TC",Trinucleotide),"TCN","nonTCN"))%>%
  group_by(region,cont_type)%>%
  dplyr::summarise(count_sum=sum(count))%>%
  spread(cont_type,count_sum)%>%
  mutate(rate=TCN/nonTCN)%>%
  ggplot(aes(x=region,y=rate,colour="black"))+
  geom_bar(stat="identity")+
  theme_classic()+
  theme(legend.position="none")+
  ylim(c(0,15))

ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/other_kat.rate.pdf",p_other_rate,
       width=8,height=10)

# Displayed only: summed counts per region, dodged by context class.
other_kat_input_df_v2%>%
  filter(grepl("C>",Substitution))%>%
  mutate(cont_type=ifelse(grepl("^TC",Trinucleotide),"TCN","nonTCN"))%>%
  group_by(region,cont_type)%>%
  dplyr::summarise(count_sum=sum(count))%>%
  ggplot(aes(x=region,y=count_sum,fill=cont_type))+
  geom_bar(stat="identity",position="dodge")+
#  facet_wrap(~region,scales="free")+
  theme_classic()

# Printed only: TCN / nonTCN ratio per region and substitution.
other_kat_input_df_v2%>%
  filter(grepl("C>",Substitution))%>%
  mutate(cont_type=ifelse(grepl("^TC",Trinucleotide),"TCN","nonTCN"))%>%
  group_by(region,Substitution,cont_type)%>%
  dplyr::summarise(count_sum=sum(count))%>%
  spread(cont_type,count_sum)%>%
  mutate(rate=TCN/nonTCN)

# Displayed only: the same ratio as bars per substitution, dodged by region.
other_kat_input_df_v2%>%
  filter(grepl("C>",Substitution))%>%
  mutate(cont_type=ifelse(grepl("^TC",Trinucleotide),"TCN","nonTCN"))%>%
  group_by(region,Substitution,cont_type)%>%
  dplyr::summarise(count_sum=sum(count))%>%
  spread(cont_type,count_sum)%>%
  mutate(rate=TCN/nonTCN)%>%
  ggplot(aes(x=Substitution,y=rate,fill=region))+
  geom_bar(stat="identity",position="dodge")
#  mutate(count_rate=kat_mut_count/kat_mut_exp_count)
#  mutate(count_rate=kat_mut_count/kat_mut_exp_count)



# ---- Summaries restricted to labels with at least one kataegis mutation ----
# Each pipeline widens the table to one row per label, keeps labels with
# kat_mut_count > 0, stacks the two count columns again and summarises.
# The count columns are gathered by name instead of by position (6:7) so the
# step does not depend on column order. An orphaned pipeline tail that used to
# follow these summaries (a group_by() call with no data argument, which
# fails at run time) has been removed; it duplicated the end of the second
# pipeline below.

# Ratio kat_mut_count / kat_mut_exp_count per C> substitution and context class.
other_kat_input_df_v2 %>%
  filter(grepl("C>", Substitution)) %>%
  mutate(cont_type = ifelse(grepl("^TC", Trinucleotide), "TCN", "nonTCN")) %>%
  spread(region, count) %>%
  filter(kat_mut_count > 0) %>%
  gather(region, count, kat_mut_exp_count, kat_mut_count) %>%
  group_by(region, Substitution, cont_type) %>%
  dplyr::summarise(count_sum = sum(count)) %>%
  spread(region, count_sum) %>%
  mutate(count_rate = kat_mut_count / kat_mut_exp_count)

# TCN / nonTCN ratio per region.
other_kat_input_df_v2 %>%
  filter(grepl("C>", Substitution)) %>%
  mutate(cont_type = ifelse(grepl("^TC", Trinucleotide), "TCN", "nonTCN")) %>%
  spread(region, count) %>%
  filter(kat_mut_count > 0) %>%
  gather(region, count, kat_mut_exp_count, kat_mut_count) %>%
  group_by(region, cont_type) %>%
  dplyr::summarise(count_sum = sum(count)) %>%
  spread(cont_type, count_sum) %>%
  mutate(rate = TCN / nonTCN)

# TCN / nonTCN ratio per region and substitution.
other_kat_input_df_v2 %>%
  filter(grepl("C>", Substitution)) %>%
  mutate(cont_type = ifelse(grepl("^TC", Trinucleotide), "TCN", "nonTCN")) %>%
  spread(region, count) %>%
  filter(kat_mut_count > 0) %>%
  gather(region, count, kat_mut_exp_count, kat_mut_count) %>%
  group_by(region, Substitution, cont_type) %>%
  dplyr::summarise(count_sum = sum(count)) %>%
  spread(cont_type, count_sum) %>%
  mutate(rate = TCN / nonTCN)

# ---- Plot: scaled rates per label (other_kat_rate_96.pdf) ----
# h1: dodged bars of `count` (rates multiplied by 1e6 earlier in the script)
# per label for the two regions, on a pseudo-log y axis from 0 to 35000.
h1<-ggplot(other_kat_input_df)+
  #geom_hline(yintercept=seq(0,150,by=50),
  #           linetype="dotted",col="grey")+
  # Faint background rectangle behind every label, spanning the full y range.
  geom_rect(aes(xmin = as.numeric(type_3) - 0.5, xmax = as.numeric(type_3) + 0.5, ymin = 0, ymax = 35000,  fill = Substitution), alpha = 0.06)+
  geom_bar(mapping=aes(x=type_3,y=count,fill=region),
           stat="identity",
           position="dodge",
           width=0.5)+
  #  guides(fill = guide_legend(ncol = 2))+
  theme(#axis.text.x.bottom = element_blank(),
    #axis.ticks.x = element_blank(),
    axis.ticks.y=element_line(size=1,colour="grey"),
    axis.ticks.length.y=unit(-0.25,"cm"),
    panel.spacing.x = unit(0, "mm"),
    axis.title.x = element_blank(),
    strip.background.x = element_blank(),
    strip.text.x = element_blank())+
  #facet_grid(.~Substitution, scales = "free_x")+
  theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1,size=20,family="Consolas"),
        axis.text.y=element_text(size=20),
        axis.title.y=element_text(size=20)
  )+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = "grey", fill=NA, size=1),
  )+
  scale_x_continuous(
    breaks = c(1:96),
    labels = contextorder96,
    expand = c(0.01, 0)
  )+
  #scale_y_log10()+
  # Pseudo-log transform (base 10, sigma 0.1) so that zero rates can be drawn.
  scale_y_continuous(
    limits=c(0,35000),
    expand=c(0,0),
    trans=scales::pseudo_log_trans(base = 10,sigma = 0.1),
    breaks=c(0,1,10,100,1000,10000,30000),
    labels=c(0,1,10,100,1000,10000,30000)
  )+
  xlab("")+ylab("")+
  #scale_fill_manual(values=col_c6)+
  # The legend is left enabled at this point (the line below is commented
  # out); it is switched off after get_legend() further down.
  #theme(legend.position = "none")+
  scale_fill_manual(values=c(palette.COSMIC.SNV.96[1:3],col_c6[1],palette.COSMIC.SNV.96[4:6],col_c6[2]))+
  scale_colour_manual(values=c(col_c6))+
  
  ylab("mutation rate / 1Mb")

#  scale_colour_manual(values=c(palette.COSMIC.SNV.96,col_c6))

h1
# h2: header strip of unit-height tiles filled and faceted by Substitution.
h2<-ggplot(other_kat_input_df)+
  geom_bar(mapping = aes(x = type_3, y = 1, fill = Substitution),
           stat = "identity",
           width = 1)+
  theme_void()+
  theme(panel.spacing.x = unit(0, "mm"),)+
  facet_grid(.~Substitution, scales = "free_x")+
  scale_fill_manual(values=palette.COSMIC.SNV.96)+
  theme(legend.position = "none")+
  theme(strip.text.x=element_text(size=40))

h2

legend <- plot_grid(get_legend(h2), get_legend(h1), ncol = 1)
h1 <- h1 + theme(legend.position = "none")
h2 <- h2 + theme(legend.position = "none")

# Header strip stacked above the bar plot, then written to PDF via cairo_pdf.
other_kat_rate_plot<-plot_grid(h2, h1, align = "v", ncol = 1, axis = "tblr", rel_heights = c(0.5, 5))
other_kat_rate_plot

save_plot("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/other_kat_rate_96.pdf",other_kat_rate_plot,
          ncol=1,
          nrow=2,
          base_asp=7,
          #unit="px",
          
          device=cairo_pdf)




# ---- Fisher's exact test per 96-class label ----
# For every label, test the 2x2 table
#   [ kat_mut_count       kat_ref_cont_count                ]
#   [ kat_out_mut_count   kat_out_ref_cont_count * id_count ]
# and keep the p-value together with the label (stored in column sig_cont).
# NOTE(review): `id_count` is not defined in the visible part of this script --
# confirm where it is set.
fisher_tmp<-lapply(tot_mut_count_ref_count_df$mut_sig_cont,function(x){
  df<-tot_mut_count_ref_count_df%>%filter(mut_sig_cont==x)
  df2<-data.frame(pval=(fisher.test(rbind(c(df$kat_mut_count,df$kat_ref_cont_count),c(df$kat_out_mut_count,(df$kat_out_ref_cont_count)*id_count))))$p.value,sig_cont=x)%>%as.tibble()
})
  
fisher_df<-do.call(rbind,fisher_tmp)  
c_pval<-fisher_df$pval

# Benjamini-Hochberg adjusted p-values, labelled with contextorder96.
# NOTE(review): the labels are attached by position, which assumes the rows of
# tot_mut_count_ref_count_df are in contextorder96 order -- confirm.
pval_df<-p.adjust(c_pval, method = "BH", n = length(c_pval))%>%as.tibble()%>%cbind(sig_cont=contextorder96)
pval_df%>%
write.table("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/other_kat_fisher_pval.txt",
            sep="\t",
            quote=F,
            row.names=F)

# ---- Per-cluster Fisher tests on [AGC]C> counts (exploratory) ----
# NOTE(review): `kat_count_sum_df`, `index_df` and `chi_df` are not defined in
# the visible part of this script; they must already exist in the session
# when this section runs -- confirm their origin.
# A bare `fisher.test()` call with no arguments used to sit in this section;
# it can only fail (its table argument is required), so it has been removed.

# Inspect clusters outside excl_info, ordered by decreasing tot_VCN.
VCN_df %>%
  filter(!info %in% excl_info) %>%
  arrange(-tot_VCN)
kat_count_sum_df
prop.table(c(416331, 68824492689))
kat_count_sum_df %>%
  dplyr::summarise(
    tot_count_sum = sum(count_sum),
    out_count_sum = sum(out_count_sum)
  )

# Derive the sample id from the cluster key by stripping the trailing
# "_<digits>_<digits>", then attach each sample's count of [ACG]C> mutations.
# NOTE(review): the pattern only strips numeric fields, so a key whose
# chromosome field is "X" would keep its suffix and not match any id -- confirm
# no such clusters exist.
kat_count_sum_df <- left_join(
  kat_count_sum_df %>%
    mutate(id = gsub("_[0-9]*_[0-9]*$", "", info)),
  merge_df %>%
    mutate(VCN = ifelse(grepl("[ACG]C>", sig_cont), 1, 0)) %>%
    group_by(id) %>%
    dplyr::summarise(VCN_count = sum(VCN))
)
#fisher.test(rbind(c(416331,68824492689),c(4103,655471221)))
#?chisq.
(kat_count_sum_df %>%
  filter(info %in% (VCN_df %>% filter(!info %in% excl_info))$info))$info
x <- "A3A_1st_C3_100ng-1_1_2"

# Attach the per-cluster count of [AGC]C> mutations computed from index_df.
kat_count_sum_df <- left_join(
  kat_count_sum_df,
  index_df %>%
    mutate(VCN = ifelse(grepl("[AGC]C>", sig_cont), 1, 0)) %>%
    group_by(info) %>%
    dplyr::summarise(tot_VCN = sum(VCN))
)
kat_count_sum_df

# One Fisher test per cluster present in VCN_df, on the 2x2 table
#   [ tot_VCN               count_sum     ]
#   [ VCN_count - tot_VCN   out_count_sum ]
fisher_tmp <- lapply(
  (kat_count_sum_df %>% filter(info %in% (VCN_df$info)))$info,
  function(x) {
    df <- kat_count_sum_df %>% filter(info == x)
    #df2<-data.frame(pval=(chisq.test(c(1,df$count_sum),simulate.p.value=TRUE,B=10000,p=prop.table(c(df$VCN_count,df$out_count_sum))))$p.value,info=x)%>%as.tibble()
    #df2<-data.frame(pval=fisher.test(rbind(c(1,df$count_sum),c(df$VCN_count,df$out_count_sum)))$p.value,info=x)%>%as.tibble()
    contingency <- rbind(
      c(df$tot_VCN, df$count_sum),
      c(df$VCN_count - df$tot_VCN, df$out_count_sum)
    )
    # as_tibble() replaces the deprecated as.tibble().
    as_tibble(data.frame(pval = fisher.test(contingency)$p.value, info = x))
  }
)
fisher_df <- do.call(rbind, fisher_tmp)
chi_df
index_df %>% filter(info == "A3A_C3_TP53_C3_100ng-3_10_2")
fisher_df %>%
  filter(info %in% VCN_df$info) %>%
  arrange(-pval)

# Raw p-values of the clusters present in VCN_df; adjusted further below.
c_pval <- (fisher_df %>%
  filter(info %in% VCN_df$info))$pval


log10(c_pval)
c_pval
library(ggforce)
c_pval %>% max()





# ---- Distribution of BH-adjusted per-cluster p-values ----

# Skeleton with the eight bins that are shown in the bar plot.
count_ref_df <- as.tibble(data.frame(group = 1:8))

# Bin the adjusted p-values by order of magnitude:
#   1: >= 1e-2, 2: [1e-3, 1e-2), ..., 8: [1e-9, 1e-8), 9: < 1e-9
# and count the values per bin. Joining onto count_ref_df keeps bins 1..8
# only, so values in bin 9 do not appear in p_count_df.
p_count_df <- left_join(
  count_ref_df,
  p.adjust(c_pval, method = "BH", n = length(c_pval)) %>%
    as.tibble() %>%
    mutate(
      group = ifelse(
        value >= 1e-2, 1,
        ifelse(
          value >= 1e-3, 2,
          ifelse(
            value >= 1e-4, 3,
            ifelse(
              value >= 1e-5, 4,
              ifelse(
                value >= 1e-6, 5,
                ifelse(
                  value >= 1e-7, 6,
                  ifelse(
                    value >= 1e-8, 7,
                    ifelse(value >= 1e-9, 8, 9)
                  )
                )
              )
            )
          )
        )
      )
    ) %>%
    group_by(group) %>%
    dplyr::summarise(n = n())
)

# Empty bins get a count of 0; bins become an ordered factor 1..8.
p_count_df[is.na(p_count_df)] <- 0
p_count_df$group <- factor(p_count_df$group, levels = 1:8)

# Bar plot of the number of clusters per bin.
p_mix_bar <- ggplot(p_count_df, aes(x = group, y = n, col = "black")) +
  geom_bar(stat = "identity") +
  guides(colour = "none") +
  theme_classic()
p_mix_bar
ggsave(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/p_dist_mix_kat.bar.pdf",
  p_mix_bar,
  height = 8,
  width = 10
)

# Histogram of the adjusted p-values on a reversed log10 x axis
# (0.05 on the left down to 1e-7 on the right).
p_mix <- ggplot(
  as.tibble(p.adjust(c_pval, method = "BH", n = length(c_pval))),
  aes(x = value)
) +
  geom_histogram() +
  scale_x_continuous(
    trans = trans_reverser("log10"),
    breaks = c(0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.05),
    limits = c(0.05, 0.0000001)
  ) +
  theme_bw()
p_mix
ggsave(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/p_dist_VCN_kat.pdf",
  p_mix,
  height = 8,
  width = 10
)

# ---- mixed-kataegis ----
# Cluster keys ("<id>_<chrom>_<cluster_id>") of the clusters handled in this
# section.
mix_c <- c(
  "A3A_C3_TP53_C3_3ug-3_1_5", "A3A_C3_3ug-5_5_2",
  "A3A_1st_C3_3ug-2_2_7", "A3A_1st_C3_3ug-2_2_17",
  "A3A_C3_3ug-5_22_5"
)

mix_c

