library(tidyverse)
library(ggplot2)
library(dplyr)
# ---- Load metadata, clustered-mutation annotation and per-sample variant tables ----

# Metadata table; joined onto the merged variant table later in the script.
metadata <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/sig/metadata.txt")


# Clustered-mutation annotation covering all samples (v2 run).
cl_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/v2/APOBEC_clustered_mutations/annotated/vaf_considered/APOBEC_clustered_mutations.all_samples.annotated.tsv")


# Variant tables whose file names end in "excl_common.gd".
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
files_to_read <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter",
  ".*excl_common.gd$",
  full.names = TRUE
)

# Read every file, derive the sample id by stripping everything from
# ".mutect2" onwards in the file name, and keep the VCF-style columns
# together with the sig_cont context string.
vcf_tmp <- lapply(files_to_read, function(x) {
  read_tsv(x, show_col_types = FALSE) %>%
    mutate(id = gsub(".mutect2.*", "", basename(x))) %>%
    #select(id,`#CHROM`,POS,REF,ALT,sig_cont)
    select(`#CHROM`, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sig_cont, id)
})

# Stack all samples and keep only rows whose sig_cont contains a
# two-base ">" two-base pattern (e.g. "TC>TA").
merge_df <- do.call(rbind, vcf_tmp)
merge_df <- merge_df %>%
  filter(grepl("[ATGC][ATGC]>[ATGC][ATGC]", sig_cont))

# Attach metadata columns (all except `m/d`); join keys are the shared columns.
merge_df <- merge_df %>%
  left_join(metadata %>% select(-`m/d`))


# Attach the cluster annotation after renaming its key columns to the
# names used in merge_df (#CHROM / POS / id).
merge_df <- merge_df %>%
  left_join(
    cl_df %>%
      select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3) %>%
      plyr::rename(c("chr" = "#CHROM", "start" = "POS", "samples" = "id"))
  )

# Variants without a cluster annotation get IMD 0, cluster_id "." and
# cluster "non-clust"; the raw cluster-type column is then dropped.
merge_df <- merge_df %>%
  mutate(
    IMD = ifelse(is.na(IMD), 0, IMD),
    cluster_id = ifelse(is.na(cluster_id), ".", cluster_id),
    cluster = ifelse(
      is.na(cluster_type_omikli_upto_3),
      "non-clust",
      cluster_type_omikli_upto_3
    )
  ) %>%
  select(-cluster_type_omikli_upto_3)

# Restrict to A3A samples at the 3ug / 100ng doses.
A3A_cl_merge_df <- merge_df %>%
  filter(dose %in% c("3ug", "100ng")) %>%
  filter(APOBEC == "A3A")

A3A_cl_merge_df <- A3A_cl_merge_df %>%
  mutate(APOBEC = "A3A")

# AMS = number of variants per cluster whose sig_cont matches
# "TC>[GT][AT]"; keep only clustered variants from clusters with AMS >= 2.
A3A_cl_merge_f_df <- A3A_cl_merge_df %>%
  left_join(
    A3A_cl_merge_df %>%
      group_by(id, `#CHROM`, APOBEC, dose, TP53, cluster_id, cluster) %>%
      dplyr::summarise(AMS = sum(grepl("TC>[GT][AT]", sig_cont))) %>%
      filter(cluster != "non-clust")
  ) %>%
  filter(AMS >= 2)

# Add the number of retained SNVs per cluster (no_snv).
A3A_cl_merge_f_df <- A3A_cl_merge_f_df %>%
  left_join(
    A3A_cl_merge_f_df %>%
      group_by(id, `#CHROM`, cluster_id, cluster) %>%
      dplyr::summarise(no_snv = n())
  )

# ---- Cluster exclusion lists ----
# Three tables of clusters to drop; the latter two get an `info` key built
# as id_#CHROM_cluster_id so they can be matched against the cluster table.
excl_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/exclude_cl_mut_df.v2.txt")

overlap_excl_df <- mutate(
  read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/overlapped_cluster.v2.edit.txt"),
  info = paste(id, `#CHROM`, cluster_id, sep = "_")
)

miss_phs_excl_df <- mutate(
  read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/miss_phased_cluster.txt"),
  info = paste(id, `#CHROM`, cluster_id, sep = "_")
)



# Same key on the cluster table itself.
A3A_cl_merge_f_df <- A3A_cl_merge_f_df %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))

# Console inspection only: A3A rows of the exclusion list.
excl_df %>% filter(grepl("A3A", id))

# Drop every cluster listed in any of the three exclusion tables.
A3A_cl_merge_f_excl_df <- A3A_cl_merge_f_df %>%
  filter(!info %in% excl_df$info) %>%
  filter(!info %in% overlap_excl_df$info) %>%
  filter(!info %in% miss_phs_excl_df$info)


# Five cluster keys; the same keys are special-cased further down when
# cluster ranges are widened.
mix_c <- c(
  "A3A_C3_TP53_C3_3ug-2_16_2",
  "A3A_C3_3ug-5_5_2",
  "A3A_1st_C3_3ug-2_2_7",
  "A3A_1st_C3_3ug-2_2_17",
  "A3A_C3_3ug-5_22_5"
)

mix_kat_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/32_cluster_signature/00_original_files/A3A_clsuter_SNV.kat.mix_strand.vcf")


# Count SNVs per cluster and reference base, then spread the counts into
# one column per base: std_A_mut, std_C_mut, std_G_mut, std_T_mut.
A3A_cl_merge_f_excl_count_df <- A3A_cl_merge_f_excl_df %>%
  group_by(id, `#CHROM`, cluster_id, info, cluster, REF) %>%
  dplyr::summarise(n = n()) %>%
  mutate(REF = paste0("std_", REF, "_mut")) %>%
  spread(REF, n)


# A base that never occurs in a cluster counts as 0.
A3A_cl_merge_f_excl_count_df[is.na(A3A_cl_merge_f_excl_count_df)] <- 0


# Order clusters by decreasing number of A/T-reference SNVs and record the
# dominant reference base: "G" only when G outnumbers C, otherwise "C"
# (ties go to "C").
A3A_cl_merge_f_excl_count_df <- A3A_cl_merge_f_excl_count_df %>%
  arrange(-(std_A_mut + std_T_mut)) %>%
  mutate(main_ref = ifelse(std_G_mut > std_C_mut, "G", "C"))


# Put the per-cluster counts and main_ref back on the per-SNV table.
A3A_cl_merge_f_excl_ref_df <- A3A_cl_merge_f_excl_df %>%
  left_join(A3A_cl_merge_f_excl_count_df)

# IMD cutoff table; its "Sample" column is renamed to "id" so it can be
# joined by sample id later.
cut_df <- plyr::rename(
  read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/v2/imds/APOBEC_clustered_mutations_imd_cutoff.tsv"),
  c("Sample" = "id")
)

# Attach the smallest and largest POS of each cluster key (info).
A3A_cl_merge_f_excl_ref_df <- A3A_cl_merge_f_excl_ref_df %>%
  left_join(
    A3A_cl_merge_f_excl_ref_df %>%
      group_by(info) %>%
      dplyr::summarise(
        min_pos = min(POS),
        max_pos = max(POS)
      )
  )



# Context count table; its `value` column is summed per cluster and context
# when the per-cluster reference counts are built below.
kat_cont_count_df <- read_tsv("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/kat_cont_count_df.v2.txt")


##making count_ref##
####################



# 96 pyrimidine-centred mutation contexts written as
# <5' base><ref>">"<alt><3' base>, e.g. "AC>TA".
# Order: C>A, C>G, C>T, T>A, T>C, T>G, each over the 16 flanking pairs.
contextorder96 <- paste0(
  rep(rep(c("A", "C", "G", "T"), each = 4), length.out = 96), # 5' base
  rep(c("C", "T"), each = 48),                                # reference base
  ">",
  rep(c("A", "G", "T", "A", "C", "G"), each = 16),            # alternate base
  rep(c("A", "C", "G", "T"), length.out = 96)                 # 3' base
)

# Console inspection: context 33 and its reverse complement.
contextorder96[33]
local({
  comp <- chartr("ACGT", "TGCA", contextorder96)
  paste0(substr(comp, 5, 5), substr(comp, 2, 2), ">", substr(comp, 4, 4), substr(comp, 1, 1))
})[33]

# 192 contexts: the 96 above followed by their reverse complements in the
# same order. Complementing the whole string leaves ">" untouched, so only
# the character positions need swapping.
contextorder192 <- c(
  contextorder96,
  local({
    comp <- chartr("ACGT", "TGCA", contextorder96)
    paste0(substr(comp, 5, 5), substr(comp, 2, 2), ">", substr(comp, 4, 4), substr(comp, 1, 1))
  })
)



# One row per strand-specific mutation context (192 rows), with the
# trinucleotide (5' base, reference base, 3' base) in sig_cont.
ref_64_df <- data.frame(mut_sig_cont = contextorder192) %>%
  as_tibble() %>%
  mutate(sig_cont = paste0(substr(mut_sig_cont, 1, 2), substr(mut_sig_cont, 5, 5)))

# Unique (id, info) pairs of kataegis clusters. Their number was previously
# hard-coded as 106; it is now taken from the data so the grid stays
# consistent when the cluster set changes.
kat_cluster_keys <- A3A_cl_merge_f_excl_ref_df %>%
  filter(cluster == "kataegis") %>%
  select(id, info) %>%
  unique()
n_kat_clusters <- nrow(kat_cluster_keys)
n_contexts <- nrow(ref_64_df)

# Full grid: every kataegis cluster x every context. REF is the middle
# base of the trinucleotide.
ref_64_id_df <- data.frame(
  sig_cont = rep(ref_64_df$sig_cont, n_kat_clusters),
  mut_sig_cont = rep(ref_64_df$mut_sig_cont, n_kat_clusters),
  id = rep(kat_cluster_keys$id, each = n_contexts),
  info = rep(kat_cluster_keys$info, each = n_contexts)
) %>%
  as_tibble() %>%
  mutate(REF = substr(sig_cont, 2, 2))


##kataegis_ref_cont###
############################
# Keep a copy of a previous result if one exists in the session. The
# unconditional copy failed with "object not found" on a fresh top-to-bottom
# run, because the object is only created by the statement below.
if (exists("mix_kat_cont_count_sum_df")) {
  mix_kat_cont_count_sum_df_bak <- mix_kat_cont_count_sum_df
}


# Per kataegis cluster, sum the context counts (`value`) of trinucleotides
# whose middle base is on the strand opposite to the cluster's main_ref:
# G/A-centred for main_ref "C", C/T-centred for main_ref "G".
mix_kat_cont_count_sum_df <- left_join(
  A3A_cl_merge_f_excl_ref_df %>%
    filter(cluster == "kataegis") %>%
    select(id, info, main_ref) %>%
    unique(),
  kat_cont_count_df
) %>%
  filter(
    (main_ref == "C" & grepl("^.[GA].", sig_cont)) |
      (main_ref == "G" & grepl("^.[CT].", sig_cont))
  ) %>%
  group_by(id, info, sig_cont) %>%
  dplyr::summarise(n = sum(value))

# Expand onto the cluster x context grid; combinations with no count get 0.
df2 <- left_join(ref_64_id_df, mix_kat_cont_count_sum_df) %>%
  plyr::rename(c("n" = "mix_kat_ref_cont_count"))
df2[is.na(df2)] <- 0
###reference_cont_total##
#########################
# Total count of each trinucleotide, summed over all rows of the file.
ref_cont_count_df <- read_tsv("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/ref_cont_count_df.v2.txt") %>%
  group_by(sig_cont) %>%
  dplyr::summarise(mix_kat_ref_count = sum(value))


# One row per kataegis cluster with its dominant reference base.
kat_main_ref_df <- A3A_cl_merge_f_excl_ref_df %>%
  filter(cluster == "kataegis") %>%
  select(id, info, main_ref) %>%
  unique()

mix_kat_count <- kat_main_ref_df %>%
  group_by(main_ref) %>%
  dplyr::summarise(n = n())


# Values observed when the script was written (previously hard-coded below):
#mix_kat_count
#main_ref     n
#C           51
#G           55

# Pair every cluster with the totals of the trinucleotides centred on the
# strand opposite to its main_ref. Repeat counts are derived from the data
# instead of the literals 32 / 51 / 55, and seq_len() keeps an empty group
# empty (1:0 would select row 1).
clusters_main_c <- kat_main_ref_df %>% filter(main_ref == "C")
clusters_main_g <- kat_main_ref_df %>% filter(main_ref == "G")
ref_purine_df <- ref_cont_count_df %>%
  filter(substr(sig_cont, 2, 2) %in% c("G", "A"))
ref_pyrimidine_df <- ref_cont_count_df %>%
  filter(substr(sig_cont, 2, 2) %in% c("C", "T"))

sub_df3_1 <- cbind(
  clusters_main_c %>%
    dplyr::slice(rep(seq_len(n()), each = nrow(ref_purine_df))),
  ref_purine_df %>%
    dplyr::slice(rep(seq_len(n()), nrow(clusters_main_c)))
)

sub_df3_2 <- cbind(
  clusters_main_g %>%
    dplyr::slice(rep(seq_len(n()), each = nrow(ref_pyrimidine_df))),
  ref_pyrimidine_df %>%
    dplyr::slice(rep(seq_len(n()), nrow(clusters_main_g)))
)

merge_sub_df3 <- rbind(
  sub_df3_1,
  sub_df3_2
) %>%
  as_tibble()


# Expand onto the cluster x context grid; contexts on the cluster's own
# strand have no match and become 0.
df3 <- left_join(ref_64_id_df, merge_sub_df3) %>%
  select(-main_ref)
df3[is.na(df3)] <- 0
df3 <- df3 %>%
  plyr::rename(c("mix_kat_ref_count" = "mix_tot_ref_cont_count"))


library(stringi)
library(stringr)
##mix_kat_cont###
#################

# sub_df1: kataegis clusters whose main_ref is "C". Counts the SNVs whose
# REF is G or A per (id, info, REF, sig_cont), reverse-complements the
# context strings, and joins the counts onto the G/A-centred rows of the
# cluster x context grid.
# NOTE(review): sig_cont in the SNV table appears to be stored in
# pyrimidine-centred form regardless of REF, which is why it is
# reverse-complemented here — confirm against the upstream annotation.
sub_df1<-left_join(ref_64_id_df%>%filter(grepl(".[GA].",sig_cont)),
  A3A_cl_merge_f_excl_ref_df%>%filter(cluster=="kataegis")%>%
    filter(main_ref=="C")%>%
    filter(REF%in%c("G","A"))%>%
    group_by(id,info,REF,sig_cont)%>%
    dplyr::summarise(n=n())%>%
    # the SNV-level sig_cont is the 5-character mutation context; rename it
    # and derive the 3-base trinucleotide from positions 1, 2 and 5
    plyr::rename(c("sig_cont"="mut_sig_cont"))%>%
    mutate(sig_cont=paste0(substr(mut_sig_cont,1,2),substr(mut_sig_cont,5,5)))%>%
    # reverse complement of the trinucleotide
    mutate(sig_cont=stri_reverse(chartr("ACGT","TGCA",sig_cont)))%>%
    # reverse complement of the mutation context, built base by base
    # (stri_reverse on a single character leaves it unchanged)
    mutate(mut_sig_cont=paste0(stri_reverse(chartr("ACGT","TGCA",substr(mut_sig_cont,5,5))),
                               stri_reverse(chartr("ACGT","TGCA",substr(mut_sig_cont,2,2))),
                               ">",
                               stri_reverse(chartr("ACGT","TGCA",substr(mut_sig_cont,4,4))),
                               stri_reverse(chartr("ACGT","TGCA",substr(mut_sig_cont,1,1)))))%>%ungroup()
)

# grid rows without any SNV get a count of 0
sub_df1[is.na(sub_df1)]<-0



# sub_df2: kataegis clusters whose main_ref is "G". Counts the SNVs whose
# REF is C or T; contexts are used as stored (no reverse complement) and
# joined onto the C/T-centred rows of the grid.
sub_df2<-left_join(ref_64_id_df%>%filter(grepl(".[CT].",sig_cont)),
A3A_cl_merge_f_excl_ref_df%>%filter(cluster=="kataegis")%>%
  filter(main_ref=="G")%>%
  filter(REF%in%c("C","T"))%>%
  group_by(id,info,REF,sig_cont)%>%
  dplyr::summarise(n=n())%>%
  plyr::rename(c("sig_cont"="mut_sig_cont"))%>%
  mutate(sig_cont=paste0(substr(mut_sig_cont,1,2),substr(mut_sig_cont,5,5)))%>%
  ungroup()
  
)


sub_df2[is.na(sub_df2)]<-0

# stack both halves; the count column becomes mix_kat_mut_cont
df1<-rbind(
sub_df1,
sub_df2
)
df1<-df1%>%plyr::rename(c("n"="mix_kat_mut_cont"))
##total_snv_count###
####################

# Total SNV count per sample and strand-specific mutation context.
# Variants with REF C/T keep sig_cont as is; all others get its reverse
# complement (complement the whole string, then swap the positions).
# Only samples present in the filtered cluster table are kept, and the
# counts are expanded onto the cluster x context grid with 0 for no match.
df4 <- left_join(
  ref_64_id_df,
  merge_df %>%
    mutate(
      mut_sig_cont = ifelse(
        REF %in% c("C", "T"),
        sig_cont,
        paste0(
          substr(chartr("ACGT", "TGCA", sig_cont), 5, 5),
          substr(chartr("ACGT", "TGCA", sig_cont), 2, 2),
          ">",
          substr(chartr("ACGT", "TGCA", sig_cont), 4, 4),
          substr(chartr("ACGT", "TGCA", sig_cont), 1, 1)
        )
      )
    ) %>%
    group_by(id, REF, mut_sig_cont) %>%
    dplyr::summarise(n = n()) %>%
    filter(id %in% A3A_cl_merge_f_excl_ref_df$id) %>%
    ungroup() %>%
    select(-REF)
) %>%
  plyr::rename(c("n" = "tot_mut_count"))

df4[is.na(df4)] <- 0


##merge

# Combine the four per-cluster x context tables:
#   df1 mutations inside the cluster, df2 context count inside the cluster,
#   df3 total context count,          df4 total mutations of the sample.
# "out" columns are total minus inside. Rows whose total context count is 0
# are dropped.
mix_kat_count_df <- df1 %>%
  left_join(df2) %>%
  left_join(df3) %>%
  mutate(mix_kat_out_ref_cont_count = mix_tot_ref_cont_count - mix_kat_ref_cont_count) %>%
  left_join(df4) %>%
  mutate(mix_kat_out_mut_count = tot_mut_count - mix_kat_mut_cont) %>%
  filter(mix_tot_ref_cont_count != 0)

# Sum over clusters per context, then compute mutations per context
# occurrence inside (mix_kat_rate) and outside (mix_kat_out_rate).
mix_kat_count_sum_df <- mix_kat_count_df %>%
  group_by(REF, sig_cont, mut_sig_cont) %>%
  dplyr::summarise(
    mix_kat_mut_cont_sum = sum(mix_kat_mut_cont),
    mix_kat_ref_cont_count_sum = sum(mix_kat_ref_cont_count),
    mix_tot_ref_cont_count_sum = sum(mix_tot_ref_cont_count),
    mix_kat_out_ref_cont_count_sum = sum(mix_kat_out_ref_cont_count),
    tot_mut_count_sum = sum(tot_mut_count),
    mix_kat_out_mut_count_sum = sum(mix_kat_out_mut_count)
  ) %>%
  mutate(
    mix_kat_rate = mix_kat_mut_cont_sum / mix_kat_ref_cont_count_sum,
    mix_kat_out_rate = mix_kat_out_mut_count_sum / mix_kat_out_ref_cont_count_sum
  )

# Fix the context order for sorting and plotting.
mix_kat_count_sum_df[["mut_sig_cont"]] <- factor(
  mix_kat_count_sum_df[["mut_sig_cont"]],
  levels = contextorder192
)



# Long-format plotting table of the inside / outside rates:
# one row per (context, region), with region being the former column name
# (mix_kat_rate or mix_kat_out_rate) and the rate scaled by 1e6.
# main_ref is "watson" for REF C/T and "crick" otherwise.
# The grouping columns of mix_kat_count_sum_df (REF, sig_cont) survive the
# select(), which is why REF and sig_cont can still be used below.
# NOTE(review): rep(c(1:96),4) assumes exactly 384 rows (192 contexts x 2
# regions); mutate() errors if any context was filtered out upstream.
mix_kat_count_sum_input_df<-mix_kat_count_sum_df%>%select(mut_sig_cont,mix_kat_rate,mix_kat_out_rate)%>%
  gather(region,rate,mix_kat_rate:mix_kat_out_rate)%>%
  mutate(main_ref=ifelse(REF%in%c("C","T"),"watson","crick"))%>%
  plyr::rename(c("sig_cont"="Trinucleotide"))%>%
  mutate(Substitution=substr(mut_sig_cont,2,4))%>%ungroup()%>%
  mutate(type_3=rep(c(1:96),4))%>%
  mutate(rate=rate*1000000)%>%
  plyr::rename(c("rate"="count"))

mix_kat_count_sum_input_df<-mix_kat_count_sum_input_df%>%arrange(region)
# factor levels define the sort order used by the ddply() call below
mix_kat_count_sum_input_df$mut_sig_cont<-factor(mix_kat_count_sum_input_df$mut_sig_cont,levels=contextorder192)  
mix_kat_count_sum_input_df$Substitution<-factor(mix_kat_count_sum_input_df$Substitution,levels=c("C>A","C>G","C>T","T>A","T>C","T>G","G>T","G>C","G>A","A>T","A>G","A>C"))
# ddply() is called without a function; presumably it is used only to
# reorder the rows by region, main_ref, Substitution and mut_sig_cont
# before type_3 (x position 1..96 within each region/strand) is reassigned.
mix_kat_count_sum_input_df<-plyr::ddply(mix_kat_count_sum_input_df, c('region', 'main_ref','Substitution','mut_sig_cont'))%>%as.tibble()%>%
  mutate(type_3=rep(c(1:96),4))


# Export the per-context summary table.
# mix_kat_mut_exp_cont_sum is the mutation count expected inside the
# clusters if they mutated at the outside rate:
#   (outside mutations / outside context count) * inside context count.
# write.table() arguments are spelled out in full: `row.name` only worked
# through partial argument matching, and `F` is a reassignable variable.
mix_kat_count_df %>%
  group_by(REF, sig_cont, mut_sig_cont) %>%
  dplyr::summarise(
    mix_kat_mut_cont_sum = sum(mix_kat_mut_cont),
    mix_kat_ref_cont_count_sum = sum(mix_kat_ref_cont_count),
    mix_tot_ref_cont_count_sum = sum(mix_tot_ref_cont_count),
    mix_kat_out_ref_cont_count_sum = sum(mix_kat_out_ref_cont_count),
    tot_mut_count_sum = sum(tot_mut_count),
    mix_kat_out_mut_count_sum = sum(mix_kat_out_mut_count)
  ) %>%
  mutate(mix_kat_mut_exp_cont_sum = mix_kat_out_mut_count_sum / mix_kat_out_ref_cont_count_sum * mix_kat_ref_cont_count_sum) %>%
  mutate(main_ref = ifelse(REF %in% c("C", "T"), "watson", "crick")) %>%
  ungroup() %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/mix_kat_sum_table.txt",
    quote = FALSE,
    row.names = FALSE,
    sep = "\t"
  )


# Long-format plotting table of observed vs expected mutation counts per
# context. The summary is the same one written to mix_kat_sum_table.txt;
# here only the observed (mix_kat_mut_cont_sum) and expected
# (mix_kat_mut_exp_cont_sum) columns are kept and gathered into
# region / count.
# NOTE(review): rep(c(1:96),4) assumes exactly 384 rows (192 contexts x 2
# regions).
mix_kat_count_sum_input_df_v2<-mix_kat_count_df%>%
  group_by(REF,sig_cont,mut_sig_cont)%>%
  dplyr::summarise(mix_kat_mut_cont_sum=sum(mix_kat_mut_cont),
                   mix_kat_ref_cont_count_sum=sum(mix_kat_ref_cont_count),
                   mix_tot_ref_cont_count_sum=sum(mix_tot_ref_cont_count),
                   mix_kat_out_ref_cont_count_sum=sum(mix_kat_out_ref_cont_count),
                   tot_mut_count_sum=sum(tot_mut_count),
                   mix_kat_out_mut_count_sum=sum(mix_kat_out_mut_count))%>%
  # expected count = outside rate * inside context count
  mutate(mix_kat_mut_exp_cont_sum=mix_kat_out_mut_count_sum/mix_kat_out_ref_cont_count_sum*mix_kat_ref_cont_count_sum)%>%
  mutate(main_ref=ifelse(REF%in%c("C","T"),"watson","crick"))%>%
  ungroup()%>%
  select(main_ref,sig_cont,mut_sig_cont,mix_kat_mut_cont_sum,mix_kat_mut_exp_cont_sum)%>%
  gather("region","count",mix_kat_mut_cont_sum:mix_kat_mut_exp_cont_sum)%>%
  mutate(Substitution=substr(mut_sig_cont,2,4))%>%ungroup()%>%
  plyr::rename(c("sig_cont"="Trinucleotide"))%>%
  mutate(type_3=rep(c(1:96),4))
mix_kat_count_sum_input_df_v2<-mix_kat_count_sum_input_df_v2%>%arrange(region)
# factor levels define the sort order used by the ddply() call below
mix_kat_count_sum_input_df_v2$mut_sig_cont<-factor(mix_kat_count_sum_input_df_v2$mut_sig_cont,levels=contextorder192)
mix_kat_count_sum_input_df_v2$Substitution<-factor(mix_kat_count_sum_input_df_v2$Substitution,levels=c("C>A","C>G","C>T","T>A","T>C","T>G","G>T","G>C","G>A","A>T","A>G","A>C"))
# ddply() without a function; presumably used only to reorder the rows
# before type_3 (x position 1..96 within each region/strand) is reassigned
mix_kat_count_sum_input_df_v2<-plyr::ddply(mix_kat_count_sum_input_df_v2, c('region', 'main_ref','Substitution','mut_sig_cont'))%>%as.tibble()%>%
  mutate(type_3=rep(c(1:96),4))


# Bar plot of expected vs observed SNV counts in TCN contexts.
# Expected is placed first on the x axis.
mix_kat_count_sum_input_df_v2$region <- factor(
  mix_kat_count_sum_input_df_v2$region,
  levels = c("mix_kat_mut_exp_cont_sum", "mix_kat_mut_cont_sum")
)

# A trinucleotide is TCN when it starts with "TC" or, for the opposite
# strand, ends with "GA". The second pattern must be anchored with "$":
# unanchored "GA" also matched GAN trinucleotides (A-centred, i.e. the
# reverse complement of NTC) and counted them as TCN. The anchored form is
# the one already used for the observed/expected-ratio table.
p_mix_rate <- mix_kat_count_sum_input_df_v2 %>%
  mutate(cont_type = ifelse(grepl("^TC", Trinucleotide) | grepl("GA$", Trinucleotide), "TCN", "nonTCN")) %>%
  group_by(cont_type, region) %>%
  dplyr::summarise(count_sum = sum(count)) %>%
  filter(cont_type == "TCN") %>%
  # NOTE(review): col="black" inside aes() maps a constant to the colour
  # scale (adds a legend, does not draw a black outline); left unchanged.
  ggplot(aes(x = region, y = count_sum, col = "black")) +
  geom_bar(stat = "identity") +
  theme_classic() +
  ylim(c(0, 15)) +
  ylab("# of SNVs")

ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/mix_kat_rate.pdf", p_mix_rate,
       width = 10, height = 8)

# Observed / expected ratio per context class.
# Classes: "TCN" (starts with TC or ends with GA), otherwise "NTN" when the
# middle base is A or T, otherwise "nonTCN". Counts are summed per class and
# region, spread into one column per region, and oe_ratio is
# observed (mix_kat_mut_cont_sum) divided by expected
# (mix_kat_mut_exp_cont_sum).
mix_kat_count_sum_input_df_v3<-mix_kat_count_sum_input_df_v2%>%
  mutate(cont_type=ifelse(grepl("^TC",Trinucleotide)|grepl("GA$",Trinucleotide),"TCN",
                          ifelse(grepl(".[AT].",Trinucleotide),"NTN","nonTCN")))%>%
  #filter(cont_type%in%c("nonTCN","TCN"))%>%
  group_by(cont_type,region)%>%
  dplyr::summarise(count_sum=sum(count))%>%
  spread(region,count_sum)%>%
  mutate(oe_ratio=mix_kat_mut_cont_sum/mix_kat_mut_exp_cont_sum)
  #filter(cont_type%in%c("TCN","nonTCN"))

mix_kat_count_sum_input_df_v3$cont_type<-factor(mix_kat_count_sum_input_df_v3$cont_type,levels=c("TCN","nonTCN","NTN"))
# Bar plot of the ratio for the TCN and nonTCN classes only.
# NOTE(review): the y axis shows oe_ratio but is labelled "# of SNVs" —
# confirm the intended label.
# NOTE(review): col="black" inside aes() maps a constant to the colour scale
# rather than drawing a black outline.
p_mix_kat_cont_rate<-mix_kat_count_sum_input_df_v3%>%
  filter(cont_type%in%c("TCN","nonTCN"))%>%
  ggplot(aes(x=cont_type,y=oe_ratio,col="black"))+
  geom_bar(stat="identity")+
  theme_classic()+
  ylim(c(0,600))+
  #ylim(c(0,15))+
  ylab("# of SNVs")

ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/mix_kat.cont_rate.pdf",p_mix_kat_cont_rate,
       width=8,height=10)
  
  
  



# ---- Observed vs expected counts, watson-strand contexts (v2 figure) ----
# h1: dodged bars of `count` per context position (type_3), one bar per
#     region, on a pseudo-log y axis limited to 0..10, over a faint
#     background rectangle coloured by Substitution.
# h2: a thin strip coloured and faceted by Substitution, stacked above h1.
# Both Substitution (geom_rect) and region (geom_bar) are mapped to `fill`,
# so they share the single scale_fill_manual() palette; the unnamed colour
# vector therefore depends on the order in which ggplot2 arranges the
# combined levels.
# palette.COSMIC.SNV.192 and col_c6 are not defined in this part of the
# file, and plot_grid / get_legend / save_plot are not provided by any
# package attached above (presumably cowplot — confirm it is loaded).
h1<-mix_kat_count_sum_input_df_v2%>%
  #t_df%>%arrange(region)
  filter(main_ref=="watson")%>%
  ggplot()+
  #geom_hline(yintercept=seq(0,150,by=50),
  #           linetype="dotted",col="grey")+
  geom_rect(aes(xmin = as.numeric(type_3) - 0.5, xmax = as.numeric(type_3) + 0.5, ymin = 0, ymax = 10,  fill = Substitution),alpha=0.06)+
  geom_bar(mapping=aes(x=type_3,y=count,fill=region),
           stat="identity",
           position="dodge",
           width=0.5)+
  #  guides(fill = guide_legend(ncol = 2))+
  theme(#axis.text.x.bottom = element_blank(),
    #axis.ticks.x = element_blank(),
    axis.ticks.y=element_line(size=1,colour="grey"),
    axis.ticks.length.y=unit(-0.25,"cm"),
    panel.spacing.x = unit(0, "mm"),
    axis.title.x = element_blank(),
    strip.background.x = element_blank(),
    strip.text.x = element_blank())+
  #facet_grid(.~Substitution, scales = "free_x")+
  theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1,size=20,family="Consolas"),
        axis.text.y=element_text(size=20),
        axis.title.y=element_text(size=20)
  )+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = "grey", fill=NA, size=1),
  )+
  # x positions 1..96 are labelled with the 96 pyrimidine-centred contexts
  scale_x_continuous(
    breaks = c(1:96),
    labels = contextorder96,
    expand = c(0.01, 0)
  )+
  #scale_y_log10()+
  scale_y_continuous(
    limits=c(0,10),
    expand=c(0,0),
    trans=scales::pseudo_log_trans(base = 10,sigma = 0.001),
    breaks=c(0,0.001,0.01,0.1,1,10),
    labels=c(0,0.001,0.01,0.1,1,10)
  )+
  # the empty y label set here is replaced by the ylab() at the end
  xlab("")+ylab("")+
  #theme(legend.position = "none")+
  scale_fill_manual(values=c(palette.COSMIC.SNV.192[1:3],col_c6[1:2],palette.COSMIC.SNV.192[4:6]))+
  scale_colour_manual(values=c(col_c6))+
  
  ylab("# of SNVs")
h1
#  scale_colour_manual(values=c(palette.COSMIC.SNV.96,col_c6))

h1
h2<-ggplot(mix_kat_count_sum_input_df_v2%>%
             #t_df%>%arrange(region)
             filter(main_ref=="watson"))+
  geom_bar(mapping = aes(x = type_3, y = 1, fill = Substitution),
           stat = "identity",
           width = 1)+
  theme_void()+
  theme(panel.spacing.x = unit(0, "mm"),)+
  facet_grid(.~Substitution, scales = "free_x")+
  scale_fill_manual(values=palette.COSMIC.SNV.192)+
  theme(legend.position = "none")+
  theme(strip.text.x=element_text(size=40))

h2

# `legend` is built here but not used again before it is overwritten by the
# next plot section
legend <- plot_grid(get_legend(h2), get_legend(h1), ncol = 1)
h1 <- h1 + theme(legend.position = "none")
h2 <- h2 + theme(legend.position = "none")

# stack the strip (h2) above the bar chart (h1)
mix_kat_rate_plot_watson_v2<-plot_grid(h2, h1, align = "v", ncol = 1, axis = "tblr", rel_heights = c(0.5, 5))
mix_kat_rate_plot_watson_v2

save_plot("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/mix_kat_192_watson.v2.pdf",mix_kat_rate_plot_watson_v2,
          ncol=1,
          nrow=2,
          base_asp=7,
          #unit="px",
          
          device=cairo_pdf)

# ---- Observed vs expected counts, crick-strand contexts (v2 figure) ----
# Same layout as the watson v2 figure, for the rows with main_ref "crick".
# h1: dodged bars of `count` per context position (type_3), one bar per
#     region, on a pseudo-log y axis limited to 0..10, over a faint
#     background rectangle coloured by Substitution.
# h2: a thin strip coloured and faceted by Substitution, stacked above h1.
# Substitution and region share the single fill scale, so the unnamed colour
# vector depends on the order in which ggplot2 arranges the combined levels.
# palette.COSMIC.SNV.192 and col_c6 are not defined in this part of the
# file, and plot_grid / get_legend / save_plot are not provided by any
# package attached above (presumably cowplot — confirm it is loaded).
h1<-mix_kat_count_sum_input_df_v2%>%
  #t_df%>%arrange(region)
  filter(main_ref=="crick")%>%
  ggplot()+
  #geom_hline(yintercept=seq(0,150,by=50),
  #           linetype="dotted",col="grey")+
  geom_rect(aes(xmin = as.numeric(type_3) - 0.5, xmax = as.numeric(type_3) + 0.5, ymin = 0, ymax = 10,  fill = Substitution),alpha=0.06)+
  geom_bar(mapping=aes(x=type_3,y=count,fill=region),
           stat="identity",
           position="dodge",
           width=0.5)+
  #  guides(fill = guide_legend(ncol = 2))+
  theme(#axis.text.x.bottom = element_blank(),
    #axis.ticks.x = element_blank(),
    axis.ticks.y=element_line(size=1,colour="grey"),
    axis.ticks.length.y=unit(-0.25,"cm"),
    panel.spacing.x = unit(0, "mm"),
    axis.title.x = element_blank(),
    strip.background.x = element_blank(),
    strip.text.x = element_blank())+
  #facet_grid(.~Substitution, scales = "free_x")+
  theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1,size=20,family="Consolas"),
        axis.text.y=element_text(size=20),
        axis.title.y=element_text(size=20)
  )+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = "grey", fill=NA, size=1),
  )+
  # NOTE(review): the crick bars are labelled with the pyrimidine-centred
  # contexts (contextorder96), whereas the crick rate figure labels them
  # with the crick-strand contexts themselves — confirm which is intended.
  scale_x_continuous(
    breaks = c(1:96),
    labels = contextorder96,
    expand = c(0.01, 0)
  )+
  #scale_y_log10()+
  scale_y_continuous(
    limits=c(0,10),
    expand=c(0,0),
    trans=scales::pseudo_log_trans(base = 10,sigma = 0.001),
    breaks=c(0,0.001,0.01,0.1,1,10),
    labels=c(0,0.001,0.01,0.1,1,10)
  )+
  # the empty y label set here is replaced by the ylab() at the end
  xlab("")+ylab("")+
  #theme(legend.position = "none")+
  scale_fill_manual(values=c(palette.COSMIC.SNV.192[12:7],col_c6[1:2]))+
  scale_colour_manual(values=c(col_c6))+
  
  # label typo fixed ("SNvs" -> "SNVs") so both strand panels read the same
  ylab("# of SNVs")
h1
#  scale_colour_manual(values=c(palette.COSMIC.SNV.96,col_c6))

h1
h2<-ggplot(mix_kat_count_sum_input_df_v2%>%
             #t_df%>%arrange(region)
             filter(main_ref!="watson"))+
  geom_bar(mapping = aes(x = type_3, y = 1, fill = Substitution),
           stat = "identity",
           width = 1)+
  theme_void()+
  theme(panel.spacing.x = unit(0, "mm"),)+
  facet_grid(.~Substitution, scales = "free_x")+
  scale_fill_manual(values=palette.COSMIC.SNV.192[7:12])+
  theme(legend.position = "none")+
  theme(strip.text.x=element_text(size=40))

h2

# `legend` is built here but not used again before it is overwritten by the
# next plot section
legend <- plot_grid(get_legend(h2), get_legend(h1), ncol = 1)
h1 <- h1 + theme(legend.position = "none")
h2 <- h2 + theme(legend.position = "none")

# stack the strip (h2) above the bar chart (h1)
mix_kat_rate_plot_crick_v2<-plot_grid(h2, h1, align = "v", ncol = 1, axis = "tblr", rel_heights = c(0.5, 5))
mix_kat_rate_plot_crick_v2

save_plot("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/mix_kat_192_crick.v2.pdf",mix_kat_rate_plot_crick_v2,
          ncol=1,
          nrow=2,
          base_asp=7,
          #unit="px",
          
          device=cairo_pdf)






# ---- Inside vs outside mutation rate, watson-strand contexts ----
# Uses the rate table (count = rate * 1e6). Layout matches the v2 figures,
# but the y axis is limited to 0..2000 with pseudo-log sigma 0.1.
# h1: dodged bars per context position (type_3), one bar per region, over a
#     faint background rectangle coloured by Substitution.
# h2: a thin strip coloured and faceted by Substitution, stacked above h1.
# Substitution and region share the single fill scale, so the unnamed colour
# vector depends on the order in which ggplot2 arranges the combined levels.
# palette.COSMIC.SNV.192 and col_c6 are not defined in this part of the
# file, and plot_grid / get_legend / save_plot are not provided by any
# package attached above (presumably cowplot — confirm it is loaded).
h1<-mix_kat_count_sum_input_df%>%
#t_df%>%arrange(region)
  filter(main_ref=="watson")%>%
ggplot()+
  #geom_hline(yintercept=seq(0,150,by=50),
  #           linetype="dotted",col="grey")+
  geom_rect(aes(xmin = as.numeric(type_3) - 0.5, xmax = as.numeric(type_3) + 0.5, ymin = 0, ymax = 2000,  fill = Substitution),alpha=0.06)+
  geom_bar(mapping=aes(x=type_3,y=count,fill=region),
           stat="identity",
           position="dodge",
           width=0.5)+
  #  guides(fill = guide_legend(ncol = 2))+
  theme(#axis.text.x.bottom = element_blank(),
    #axis.ticks.x = element_blank(),
    axis.ticks.y=element_line(size=1,colour="grey"),
    axis.ticks.length.y=unit(-0.25,"cm"),
    panel.spacing.x = unit(0, "mm"),
    axis.title.x = element_blank(),
    strip.background.x = element_blank(),
    strip.text.x = element_blank())+
  #facet_grid(.~Substitution, scales = "free_x")+
  theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1,size=20,family="Consolas"),
        axis.text.y=element_text(size=20),
        axis.title.y=element_text(size=20)
  )+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = "grey", fill=NA, size=1),
  )+
  # x positions 1..96 are labelled with the 96 pyrimidine-centred contexts
  scale_x_continuous(
    breaks = c(1:96),
    labels = contextorder96,
    expand = c(0.01, 0)
  )+
  #scale_y_log10()+
  scale_y_continuous(
    limits=c(0,2000),
    expand=c(0,0),
    trans=scales::pseudo_log_trans(base = 10,sigma = 0.1),
    breaks=c(0,1,10,100,1000,2000),
    labels=c(0,1,10,100,1000,2000)
  )+
  # the empty y label set here is replaced by the ylab() at the end
  xlab("")+ylab("")+
  #theme(legend.position = "none")+
  scale_fill_manual(values=c(palette.COSMIC.SNV.192[1:3],col_c6[2:1],palette.COSMIC.SNV.192[4:6]))+
  scale_colour_manual(values=c(col_c6))+
  
  ylab("mutation rate / 1Mb")

#  scale_colour_manual(values=c(palette.COSMIC.SNV.96,col_c6))

h1
h2<-ggplot(mix_kat_count_sum_input_df%>%
             #t_df%>%arrange(region)
             filter(main_ref=="watson"))+
  geom_bar(mapping = aes(x = type_3, y = 1, fill = Substitution),
           stat = "identity",
           width = 1)+
  theme_void()+
  theme(panel.spacing.x = unit(0, "mm"),)+
  facet_grid(.~Substitution, scales = "free_x")+
  scale_fill_manual(values=palette.COSMIC.SNV.192)+
  theme(legend.position = "none")+
  theme(strip.text.x=element_text(size=40))

h2

# `legend` is built here but not used again before it is overwritten by the
# next plot section
legend <- plot_grid(get_legend(h2), get_legend(h1), ncol = 1)
h1 <- h1 + theme(legend.position = "none")
h2 <- h2 + theme(legend.position = "none")

# stack the strip (h2) above the bar chart (h1)
mix_kat_rate_plot_watson<-plot_grid(h2, h1, align = "v", ncol = 1, axis = "tblr", rel_heights = c(0.5, 5))


save_plot("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/mix_kat_192_watson.pdf",mix_kat_rate_plot_watson,
          ncol=1,
          nrow=2,
          base_asp=7,
          #unit="px",
          
          device=cairo_pdf)


# ---- Inside vs outside mutation rate, crick-strand contexts ----
# Same layout as the watson rate figure, for the rows whose main_ref is not
# "watson".
# Substitution and region share the single fill scale, so the unnamed colour
# vector depends on the order in which ggplot2 arranges the combined levels.
# palette.COSMIC.SNV.192 and col_c6 are not defined in this part of the
# file, and plot_grid / get_legend / save_plot are not provided by any
# package attached above (presumably cowplot — confirm it is loaded).
h1<-mix_kat_count_sum_input_df%>%
  #t_df%>%arrange(region)
  filter(main_ref!="watson")%>%
  ggplot()+
  #geom_hline(yintercept=seq(0,150,by=50),
  #           linetype="dotted",col="grey")+
  geom_rect(aes(xmin = as.numeric(type_3) - 0.5, xmax = as.numeric(type_3) + 0.5, ymin = 0, ymax = 2000,  fill = Substitution),alpha=0.06)+
  geom_bar(mapping=aes(x=type_3,y=count,fill=region),
           stat="identity",
           position="dodge",
           width=0.5)+
  #  guides(fill = guide_legend(ncol = 2))+
  theme(#axis.text.x.bottom = element_blank(),
    #axis.ticks.x = element_blank(),
    axis.ticks.y=element_line(size=1,colour="grey"),
    axis.ticks.length.y=unit(-0.25,"cm"),
    panel.spacing.x = unit(0, "mm"),
    axis.title.x = element_blank(),
    strip.background.x = element_blank(),
    strip.text.x = element_blank())+
  #facet_grid(.~Substitution, scales = "free_x")+
  theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1,size=20,family="Consolas"),
        axis.text.y=element_text(size=20),
        axis.title.y=element_text(size=20)
  )+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = "grey", fill=NA, size=1),
  )+
  # labels are the first 96 distinct mut_sig_cont values in table order;
  # this relies on the table being sorted so that the crick rows come first
  # within the first region (the unfiltered table is used here)
  scale_x_continuous(
    breaks = c(1:96),
    labels = (mix_kat_count_sum_input_df$mut_sig_cont%>%unique())[1:96],
    expand = c(0.01, 0)
  )+
  #scale_y_log10()+
  scale_y_continuous(
    limits=c(0,2000),
    expand=c(0,0),
    trans=scales::pseudo_log_trans(base = 10,sigma = 0.1),
    breaks=c(0,1,10,100,1000,2000),
    labels=c(0,1,10,100,1000,2000)
  )+
  # the empty y label set here is replaced by the ylab() at the end
  xlab("")+ylab("")+
  #theme(legend.position = "none")+
  scale_fill_manual(values=c(rev(palette.COSMIC.SNV.192[7:12]),col_c6[2:1]))+
  scale_colour_manual(values=c(col_c6))+
  
  ylab("mutation rate / 1Mb")

h1
h2<-ggplot(mix_kat_count_sum_input_df%>%
             #t_df%>%arrange(region)
             filter(main_ref!="watson"))+
  geom_bar(mapping = aes(x = type_3, y = 1, fill = Substitution),
           stat = "identity",
           width = 1)+
  theme_void()+
  theme(panel.spacing.x = unit(0, "mm"),)+
  facet_grid(.~Substitution, scales = "free_x")+
  scale_fill_manual(values=palette.COSMIC.SNV.192[7:12])+
  theme(legend.position = "none")+
  theme(strip.text.x=element_text(size=40))



# `legend` is built here but not referenced again in the visible code
legend <- plot_grid(get_legend(h2), get_legend(h1), ncol = 1)
h1 <- h1 + theme(legend.position = "none")
h2 <- h2 + theme(legend.position = "none")

# stack the strip (h2) above the bar chart (h1)
mix_kat_rate_plot_crick<-plot_grid(h2, h1, align = "v", ncol = 1, axis = "tblr", rel_heights = c(0.5, 5))

save_plot("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/mix_kat_192_crick.pdf",mix_kat_rate_plot_crick,
          ncol=1,
          nrow=2,
          base_asp=7,
          #unit="px",
          
          device=cairo_pdf)


# Console check: total number of mutations counted inside the clusters,
# summed over the contexts that have at least one.
mix_kat_count_sum_df %>%
  mutate(
    Substitution = factor(
      substr(mut_sig_cont, 2, 4),
      levels = c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G", "G>T", "G>C", "G>A", "A>T", "A>G", "A>C")
    )
  ) %>%
  arrange(Substitution) %>%
  filter(mix_kat_mut_cont_sum > 0) %>%
  ungroup() %>%
  select(mix_kat_mut_cont_sum) %>%
  sum()



# Export the in-cluster counts folded onto pyrimidine-centred contexts:
# rows with REF C/T are kept as they are, all other rows are
# reverse-complemented (trinucleotide and mutation context), and counts are
# then summed per Substitution and Trinucleotide.
mix_kat_count_sum_df %>%
  mutate(
    mut_sig_cont = as.character(mut_sig_cont),
    Trinucleotide = ifelse(
      REF %in% c("C", "T"),
      sig_cont,
      stri_reverse(chartr("ACGT", "TGCA", sig_cont))
    ),
    cmpl_sig_cont = ifelse(
      REF %in% c("C", "T"),
      mut_sig_cont,
      paste0(
        substr(chartr("ACGT", "TGCA", mut_sig_cont), 5, 5),
        substr(chartr("ACGT", "TGCA", mut_sig_cont), 2, 2),
        ">",
        substr(chartr("ACGT", "TGCA", mut_sig_cont), 4, 4),
        substr(chartr("ACGT", "TGCA", mut_sig_cont), 1, 1)
      )
    ),
    #filter(cmpl_sig_cont=="TC>TT")%>%
    Substitution = substr(cmpl_sig_cont, 2, 4)
  ) %>%
  select(Substitution, Trinucleotide, mix_kat_mut_cont_sum) %>%
  ungroup() %>%
  group_by(Substitution, Trinucleotide) %>%
  dplyr::summarise(Count = sum(mix_kat_mut_cont_sum)) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/mix_strand_count.txt",
    quote = F,
    row.names = F,
    sep = "\t"
  )



# Chi-squared test per mutation context on the 2x2 table
#   row 1: mutations inside clusters,  context count inside clusters
#   row 2: mutations outside clusters, context count outside clusters
# Each element of the result is a one-row tibble (pval, mut_sig_cont).
chisq_tmp <- lapply(mix_kat_count_sum_df$mut_sig_cont, function(ctx) {
  ctx_row <- mix_kat_count_sum_df %>% filter(mut_sig_cont == ctx)
  contingency <- rbind(
    c(ctx_row$mix_kat_mut_cont_sum, ctx_row$mix_kat_ref_cont_count_sum),
    c(ctx_row$mix_kat_out_mut_count_sum, ctx_row$mix_kat_out_ref_cont_count_sum)
  )
  data.frame(
    pval = chisq.test(contingency)$p.value,
    mut_sig_cont = ctx
  ) %>%
    as.tibble()
})
chisq_df <- do.call(rbind, chisq_tmp)

chi_c_pval <- chisq_df$pval

# Benjamini-Hochberg adjusted p-values; printed only, not stored.
p.adjust(chi_c_pval, method = "BH", n = length(chi_c_pval))

#######

# Per cluster (info), take the first and last position of the SNVs whose
# REF equals main_ref and whose context matches "TC>", then attach the
# IMD cutoff table.
mix_kat_df <- left_join(
  mix_kat_df,
  mix_kat_df %>%
    filter(REF == main_ref & grepl("TC>", sig_cont)) %>%
    group_by(info) %>%
    dplyr::summarise(
      min_pos = min(POS),
      max_pos = max(POS)
    )
) %>%
  left_join(cut_df)


# Widen each cluster range on both sides by a cluster-specific multiple of
# IMD_cutoff. The multiplier is numeric with NA_real_ for clusters not
# listed here; the previous nested ifelse() fell back to the string "NA",
# which silently turned both coordinate columns into character vectors as
# soon as one unlisted cluster was present.
mix_kat_range_df <- mix_kat_df %>%
  select(id, `#CHROM`, POS, REF, ALT, sig_cont, info, main, min_pos, max_pos, IMD_cutoff) %>%
  mutate(
    flank_n = dplyr::case_when(
      info == "A3A_1st_C3_3ug-2_2_7" ~ 1,
      info == "A3A_1st_C3_3ug-2_2_17" ~ 2,
      info == "A3A_C3_3ug-5_5_2" ~ 1,
      info == "A3A_C3_3ug-5_22_5" ~ 2,
      info == "A3A_C3_TP53_C3_3ug-2_16_2" ~ 5,
      TRUE ~ NA_real_
    ),
    cor_min_pos = min_pos - flank_n * IMD_cutoff,
    cor_max_pos = max_pos + flank_n * IMD_cutoff
  ) %>%
  select(-flank_n)


library(Rsamtools)

# Reference FASTA used to fetch the sequence of every widened cluster range.
fasta_file<-FaFile(file='/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa')
# One range per row of mix_kat_range_df, extended by one base on each side;
# reduce() then merges the ranges before the sequences are fetched.
gene_gr<-GRanges(seqnames=mix_kat_range_df$`#CHROM`,IRanges(start=(as.numeric(mix_kat_range_df$cor_min_pos)-1), end=(as.numeric(mix_kat_range_df$cor_max_pos)+1)),strand="+")
gene_seq<-getSeq(fasta_file,reduce(gene_gr))

# For every fetched sequence, count the occurrences of "TC" (labelled TCN)
# and "GA" (labelled NGA); each element is a two-row tibble with columns
# value and sig_cont.
TCA_count<-lapply(as.data.frame(gene_seq)$x,function(x){
  rbind(
    str_count(x,"TC")%>%as.tibble()%>%mutate(sig_cont="TCN"),
    str_count(x,"GA")%>%as.tibble()%>%mutate(sig_cont="NGA")
  )
}
)

# NOTE(review): this reuses the name mix_kat_count_df, replacing the
# per-cluster x context table built earlier in the script.
# NOTE(review): the info labels are assigned by position, which assumes the
# merged ranges come back one per cluster and in the same order as
# unique(info) — confirm, since reduce() may reorder or merge ranges.
mix_kat_count_df<-do.call(rbind,TCA_count)%>%
  mutate(info=rep(mix_kat_range_df$info%>%unique(),each=2))


# ---- Genome-wide TC / GA totals and per-cluster enrichment inputs ----
# The three tables below used to be computed twice: once before TC_count and
# GA_count existed (which fails with "object not found" on a fresh
# top-to-bottom run) and once after. The genome totals are now computed
# first and every table is built exactly once.

library(BSgenome.Hsapiens.Ensembl.GRCh37.jolim)
# Sequences of chromosomes 1-22 and X.
# NOTE(review): the variable name `c` masks base::c for data lookups; calls
# such as c(1, 2) still resolve to the function. Name kept so that any later
# code referring to it keeps working.
c <- getSeq(BSgenome.Hsapiens.Ensembl.GRCh37.jolim, c(as.character(c(1:22)), "X"))
# Occurrences of "TC" and "GA" per chromosome, and their genome-wide sums.
g_count1 <- str_count(c, "TC")
TC_count <- g_count1 %>% sum()
g_count2 <- str_count(c, "GA")
GA_count <- g_count2 %>% sum()


# minor_count / main_count: per-row counts for the strand opposite to /
# matching the cluster's `main` base, taken from tot_G and tot_C.
mix_kat_df <- mix_kat_df %>%
  mutate(
    minor_count = ifelse(main == "C", tot_G, tot_C),
    main_count = ifelse(main == "C", tot_C, tot_G)
  )

# Per sample, count the SNVs whose context matches "TC>" split by REF
# (C -> TCN_count, G -> NGA_count), attach the clusters of that sample and
# pick the count that corresponds to the cluster's minor strand.
tot_TCN_df <- merge_df %>%
  mutate(
    TCN = ifelse(grepl("TC>", sig_cont) & REF == "C", 1, 0),
    NGA = ifelse(grepl("TC>", sig_cont) & REF == "G", 1, 0)
  ) %>%
  group_by(id) %>%
  dplyr::summarise(TCN_count = sum(TCN), NGA_count = sum(NGA)) %>%
  left_join(mix_kat_df %>% select(id, info, main)) %>%
  filter(!is.na(main)) %>%
  unique() %>%
  mutate(cont_count = ifelse(main == "G", TCN_count, NGA_count))


# One row per cluster with everything needed for the Fisher test:
#   minor_count  - from mix_kat_df
#   count        - TC or GA occurrences inside the widened cluster range
#   out_count    - genome-wide total minus count
#   cont_count   - sample-wide SNV count from tot_TCN_df
mix_kat_sum_df <- left_join(
  mix_kat_df %>%
    select(info, minor_count) %>%
    unique(),
  mix_kat_count_df %>%
    spread(sig_cont, value) %>%
    left_join(mix_kat_range_df %>% select(main, info)) %>%
    mutate(count = ifelse(main == "C", TCN, NGA)) %>%
    mutate(tot_count = ifelse(main == "C", TC_count, GA_count)) %>%
    mutate(out_count = tot_count - count)
) %>%
  left_join(tot_TCN_df %>% select(info, cont_count)) %>%
  unique()


# Fisher's exact test per cluster on the 2x2 table
#   row 1: minor_count,              count
#   row 2: cont_count - minor_count, out_count
# Each element of the result is a one-row tibble (pval, info).
fisher_tmp <- lapply(unique(mix_kat_sum_df$info), function(cluster_key) {
  cluster_row <- mix_kat_sum_df %>% filter(info == cluster_key)
  contingency <- rbind(
    c(cluster_row$minor_count, cluster_row$count),
    c(cluster_row$cont_count - cluster_row$minor_count, cluster_row$out_count)
  )
  data.frame(
    pval = fisher.test(contingency)$p.value,
    info = cluster_key
  ) %>%
    as.tibble()
})


# Per-cluster Fisher tests on VCN-context counts.
# NOTE(review): kat_count_df, VC_count, GB_count, VCN_df and excl_info are
# not defined anywhere in the visible part of this file; they must already
# exist in the session for this section to run.
# count_sum is the sum of `value` per cluster, and out_count_sum is
# (VC_count + GB_count) minus that sum.
kat_count_sum_df<-kat_count_df%>%group_by(info)%>%dplyr::summarise(count_sum=sum(value))%>%
  mutate(tot_count=VC_count+GB_count)%>%
  mutate(out_count_sum=tot_count-count_sum)


# console check with two literal values; the result is not stored
prop.table(c(416331,68824492689))

# Derive the sample id by stripping the trailing "_<digits>_<digits>" from
# info, then attach the per-sample number of SNVs whose context matches
# "[ACG]C>" (VCN_count).
kat_count_sum_df<-left_join(kat_count_sum_df%>%mutate(id=gsub("_[0-9]*_[0-9]*$","",info)),merge_df%>%mutate(VCN=ifelse(grepl("[ACG]C>",sig_cont),1,0))%>%group_by(id)%>%dplyr::summarise(VCN_count=sum(VCN)))


# console check: cluster keys that will be tested
(kat_count_sum_df%>%filter(info%in%(VCN_df%>%filter(!info%in%excl_info))$info))$info
# leftover value for stepping through the function body by hand
x<-"A3A_1st_C3_100ng-1_1_2"
# Fisher's exact test per cluster on the 2x2 table
#   row 1: 1,         count_sum
#   row 2: VCN_count, out_count_sum
# This overwrites the fisher_tmp list built from mix_kat_sum_df above,
# which was never combined into a data frame.
fisher_tmp<-lapply((kat_count_sum_df%>%filter(info%in%(VCN_df%>%filter(!info%in%excl_info))$info))$info,function(x){
  df<-kat_count_sum_df%>%filter(info==x)
  #df2<-data.frame(pval=(chisq.test(c(1,df$count_sum),simulate.p.value=TRUE,B=10000,p=prop.table(c(df$VCN_count,df$out_count_sum))))$p.value,info=x)%>%as.tibble()
  df2<-data.frame(pval=(fisher.test(rbind(c(1,df$count_sum),c(df$VCN_count,df$out_count_sum))))$p.value,info=x)%>%as.tibble()
  #df2<-data.frame(pval=fisher.test(rbind(c(1,df$count_sum),c(df$VCN_count,df$out_count_sum)))$p.value,info=x)%>%as.tibble()
}
)
fisher_df<-do.call(rbind,fisher_tmp)


# Benjamini-Hochberg adjustment; prints how many adjusted p-values fall
# below 0.05
c_pval<-(fisher_df%>%
           filter(info%in%VCN_df$info)%>%filter(!info%in%excl_info))$pval
(p.adjust(c_pval, method = "BH", n = length(c_pval))<0.05)%>%table()
#######mixed-kataegis######
###########################




######
#####
# ---- Reload inputs and rebuild the merged mutation table ----
# Sample metadata.
metadata <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/sig/metadata.txt"
)

# Annotated clustered-mutation calls.
cl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/APOBEC_clustered_mutations/annotated/vaf_considered/APOBEC_clustered_mutations.all_samples.annotated.tsv"
)
# Distinct (samples, imd_cutoff) pairs, ordered by cutoff.
cl_df %>%
  dplyr::select(samples, imd_cutoff) %>%
  unique() %>%
  arrange(imd_cutoff)

# Per-sample variant tables to load.
files_to_read <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter",
  ".*prob.clonal.*.cont$",
  full.names = TRUE
)

# Peek at the previously loaded list and at the columns of the first new file.
vcf_tmp[[1]]
t_df <- read_tsv(files_to_read[1])
t_df %>% colnames()

# Read every file, tag rows with the sample id (file name up to ".mutect2")
# and keep only the columns used downstream.
vcf_tmp <- lapply(files_to_read, function(path) {
  read_tsv(path) %>%
    mutate(id = gsub(".mutect2.*", "", basename(path))) %>%
    dplyr::select(id, `#CHROM`, POS, REF, ALT, sig_cont, Func_refGene, Gene_refGene)
})
vcf_tmp[[1]]

# Stack all samples and keep rows whose sig_cont looks like "XY>ZW".
merge_df <- do.call(rbind, vcf_tmp) %>%
  filter(grepl("[ATGC][ATGC]>[ATGC][ATGC]", sig_cont))

# Add sample metadata (natural join on the shared columns).
merge_df <- merge_df %>%
  left_join(metadata %>% select(-`m/d`))

cl_df %>% select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3)

# Add cluster annotation; the cl_df key columns are renamed to match merge_df.
merge_df <- merge_df %>%
  left_join(
    cl_df %>%
      select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3) %>%
      plyr::rename(c("chr" = "#CHROM", "start" = "POS", "samples" = "id"))
  )

# Mutations without a cluster annotation get IMD 0, cluster_id "." and
# cluster "non-clust".
merge_df <- merge_df %>%
  mutate(
    IMD = ifelse(is.na(IMD), 0, IMD),
    cluster_id = ifelse(is.na(cluster_id), ".", cluster_id),
    cluster = ifelse(
      is.na(cluster_type_omikli_upto_3),
      "non-clust",
      cluster_type_omikli_upto_3
    )
  ) %>%
  select(-cluster_type_omikli_upto_3)

merge_df$dose %>% unique()

# A3A samples at the two doses of interest.
A3A_cl_merge_df <- merge_df %>%
  filter(dose %in% c("3ug", "100ng"), APOBEC == "A3A") %>%
  mutate(APOBEC = "A3A")

# Keep mutations in clusters (not "non-clust") that carry at least two
# mutations matching "TC>[GT][AT]"; AMS is that per-cluster count.
A3A_cl_merge_f_df <- A3A_cl_merge_df %>%
  left_join(
    A3A_cl_merge_df %>%
      group_by(id, `#CHROM`, APOBEC, dose, TP53, cluster_id, cluster) %>%
      dplyr::summarise(AMS = sum(grepl("TC>[GT][AT]", sig_cont))) %>%
      filter(cluster != "non-clust")
  ) %>%
  filter(AMS >= 2)

# no_snv: number of retained mutations per cluster.
A3A_cl_merge_f_df <- A3A_cl_merge_f_df %>%
  left_join(
    A3A_cl_merge_f_df %>%
      group_by(id, `#CHROM`, cluster_id, cluster) %>%
      dplyr::summarise(no_snv = n())
  )

# ---- Remove excluded clusters and summarise kataegis composition ----
# Three exclusion lists; clusters are keyed by info = "<id>_<#CHROM>_<cluster_id>".
excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/exclude_cl_mut_df.v2.txt"
)
excl_df %>% filter(grepl("A3A", id))

overlap_excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/overlapped_cluster.v2.edit.txt"
) %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))

miss_phs_excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/miss_phased_cluster.txt"
) %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))

A3A_cl_merge_f_df <- A3A_cl_merge_f_df %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))
excl_df %>% filter(grepl("A3A", id))

# Drop every cluster named in any of the three lists.
# NOTE(review): unlike the other two lists, no `info` column is created for
# excl_df here; this relies on the file already providing one - confirm.
A3A_cl_merge_f_excl_df <- A3A_cl_merge_f_df %>%
  filter(
    !info %in% excl_df$info,
    !info %in% overlap_excl_df$info,
    !info %in% miss_phs_excl_df$info
  )
A3A_cl_merge_f_excl_df %>%
  filter(cluster == "kataegis") %>%
  select(info) %>%
  unique()


# Per kataegis cluster: number of mutations matching "TC>" (tot_TCN),
# matching "[AGC]C>" (tot_VCN), and not matching "C>" at all (tot_nonC).
cont_sum_df <- A3A_cl_merge_f_excl_df %>%
  filter(cluster == "kataegis") %>%
  unique() %>%
  mutate(
    TCN = as.numeric(grepl("TC>", sig_cont)),
    VCN = as.numeric(grepl("[AGC]C>", sig_cont)),
    nonC = as.numeric(!grepl("C>", sig_cont))
  ) %>%
  group_by(info) %>%
  dplyr::summarise(
    tot_TCN = sum(TCN),
    tot_VCN = sum(VCN),
    tot_nonC = sum(nonC)
  )

# VCN_df is the same table under the name used by the later tests.
VCN_df <- cont_sum_df


A3A_cl_merge_f_excl_df %>%
  filter(cluster == "kataegis") %>%
  unique()



##exclude cluster
#A3A_1st_C3_3ug-2_10_10
#A3A_C3_TP53_C3_100ng-3_10_2
# Clusters listed for exclusion (see the ids noted above).
excl_info <- c(
  "A3A_1st_C3_3ug-2_10_10",
  "A3A_C3_TP53_C3_100ng-3_10_2",
  "A3A_C3_TP53_C3_3ug-2_4_8"
)

# IMD cutoff table, with its Sample column renamed to id.
cut_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/imds/APOBEC_clustered_mutations_imd_cutoff.tsv"
) %>%
  plyr::rename(c("Sample" = "id"))

# First and last mutated position of every kataegis cluster.
range_df <- A3A_cl_merge_f_excl_df %>%
  filter(cluster == "kataegis") %>%
  unique() %>%
  group_by(id, `#CHROM`, cluster_id, info) %>%
  dplyr::summarise(
    min_pos = min(POS),
    max_pos = max(POS)
  )

## edit range_df
range_df

# ---- Trinucleotide context counts: kataegis regions and reference genome ----
# For every kataegis cluster in range_df the plus-strand reference sequence
# from min_pos - 1 to max_pos + 1 is fetched, and each of the 64 trinucleotides
# in cont_c is counted in it. The same counting is then done on chromosomes
# 1-22 and X of the reference genome. Both tables are written to disk.
#
# Two defects of the previous version are fixed here:
#   * Sequences were fetched from reduce(gene_gr). reduce() sorts and merges
#     the ranges, so the sequences no longer lined up row-by-row with
#     range_df, yet they were labelled with rep(range_df$info, each = 64).
#     The ranges are now used as they are, one sequence per range_df row.
#   * str_count() counts non-overlapping matches, which under-counts
#     trinucleotides that can overlap themselves (e.g. "CCC" in "CCCC",
#     "TCT" in "TCTCT"). Each context is now wrapped in a zero-width
#     lookahead so every start position is counted.
library(Rsamtools)
fasta_file <- FaFile(file = "/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa")
gene_gr <- GRanges(
  seqnames = range_df$`#CHROM`,
  IRanges(
    start = (as.numeric(range_df$min_pos) - 1),
    end = (as.numeric(range_df$max_pos) + 1)
  ),
  strand = "+"
)
gene_seq <- getSeq(fasta_file, gene_gr)
TCA_count[[1]]

# 32 trinucleotides with C or T in the middle, followed by their 32 reverse
# complements.
pos_cont <- paste0(
  rep(rep(c("A", "C", "G", "T"), each = 4), 2),
  rep(c("C", "T"), each = 16),
  rep(c("A", "C", "G", "T"), 8)
)
library(stringr)
library(stringi)
rev_cont <- stri_reverse(chartr("ACGT", "TGCA", pos_cont))
cont_c <- c(pos_cont, rev_cont)

# One tibble (value, sig_cont) of length(cont_c) rows per region, in cont_c
# order.
kat_cont_count_tmp <- lapply(as.data.frame(gene_seq)$x, function(x) {
  tmp <- lapply(cont_c, function(i) {
    tibble(value = str_count(x, paste0("(?=", i, ")")), sig_cont = i)
  })
  do.call(rbind, tmp)
})

kat_cont_count_df <- do.call(rbind, kat_cont_count_tmp)
kat_cont_count_df$sig_cont <- factor(kat_cont_count_df$sig_cont, levels = cont_c)

# Label each block of length(cont_c) rows with the cluster it was counted in.
kat_cont_count_df <- kat_cont_count_df %>%
  mutate(info = rep(range_df$info, each = length(cont_c)))

# Totals per trinucleotide over all kataegis regions.
kat_cont_count_sum_df <- kat_cont_count_df %>%
  group_by(sig_cont) %>%
  dplyr::summarise(n = sum(value))
kat_cont_count_sum_df

write.table(kat_cont_count_df, "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/kat_cont_count_df.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# Same counts on the reference genome; str_count() returns one value per
# sequence in `c` for each context.
# NOTE(review): the sequences are stored in a variable named `c`, which shadows
# base::c as a value (calls to c() still work). The name is kept because later
# code may refer to it.
library(BSgenome.Hsapiens.Ensembl.GRCh37.jolim)
c <- getSeq(BSgenome.Hsapiens.Ensembl.GRCh37.jolim, c(as.character(c(1:22)), "X"))
ref_cont_count_tmp <- lapply(cont_c, function(i) {
  tibble(value = str_count(c, paste0("(?=", i, ")")), sig_cont = i)
})
ref_cont_count_df <- do.call(rbind, ref_cont_count_tmp)
write.table(ref_cont_count_df, "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/ref_cont_count_df.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
## tot VCN mut_count
#merge_df%>%mutate(VCN=ifelse(grepl("[ACG]C>",sig_cont),1,0))%>%group_by(id)%>%dplyr::summarise(VCN_count=sum(VCN))
## tot len_count

# The 96 substitution channels written as
# <5' base><ref>">"<alt><3' base>, e.g. "AC>AA".
# Order: substitution type (C>A, C>G, C>T, T>A, T>C, T>G), then 5' base,
# then 3' base, each running over A, C, G, T.
contextorder96 <- local({
  bases <- c("A", "C", "G", "T")
  substitutions <- c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")
  five_prime <- rep(rep(bases, each = 4), times = 6)
  three_prime <- rep(bases, times = 24)
  paste0(five_prime, rep(substitutions, each = 16), three_prime)
})

# Lookup from each of the 96 substitution channels (mut_sig_cont, e.g. "AC>AA")
# to its trinucleotide context (sig_cont, e.g. "ACA"): characters 1-2 plus
# character 5 of the channel.
# tibble() replaces data.frame() %>% as.tibble(); as.tibble() is deprecated.
ref_32_df <- tibble(mut_sig_cont = contextorder96) %>%
  mutate(sig_cont = paste0(substr(mut_sig_cont, 1, 2), substr(mut_sig_cont, 5, 5)))

# Number of kataegis mutations per channel (clusters in the exclusion lists
# already removed).
kat_mut_count_df <-
  A3A_cl_merge_f_excl_df %>%
  filter(cluster == "kataegis") %>%
  unique() %>%
  plyr::rename(c("sig_cont" = "mut_sig_cont")) %>%
  mutate(sig_cont = paste0(substr(mut_sig_cont, 1, 2), substr(mut_sig_cont, 5, 5))) %>%
  group_by(sig_cont, mut_sig_cont) %>%
  dplyr::summarise(kat_mut_count = n())


# Complete to all 96 channels; channels without any mutation get a count of 0.
kat_mut_count_ref_df <- left_join(ref_32_df, kat_mut_count_df)
kat_mut_count_ref_df[is.na(kat_mut_count_ref_df)] <- 0


# Attach the number of reference positions of each trinucleotide context
# inside the kataegis regions (kat_ref_cont_count).
#
# kat_cont_count_sum_df is keyed by all 64 strand-specific trinucleotides,
# while the mutation channels are pyrimidine-centred. Contexts whose middle
# base is not C/T are therefore reverse-complemented and summed into their
# pyrimidine-centred partner before joining - the same folding that is applied
# to the genome-wide counts below. Joining the unfolded table (as was done
# before) kept only the plus-strand occurrences for the regions while the
# genome-wide counts covered both strands.
kat_mut_count_ref_df <- left_join(
  kat_mut_count_ref_df,
  kat_cont_count_sum_df %>%
    mutate(sig_cont = as.character(sig_cont)) %>%
    mutate(sig_cont = ifelse(
      substr(sig_cont, 2, 2) %in% c("C", "T"),
      sig_cont,
      stri_reverse(chartr("ACGT", "TGCA", sig_cont))
    )) %>%
    group_by(sig_cont) %>%
    dplyr::summarise(kat_ref_cont_count = sum(n))
)

kat_mut_count_ref_df %>% filter(grepl("^TC", sig_cont))

# Genome-wide context counts, folded onto the pyrimidine-centred context and
# summed over all reference sequences.
ref_cont_count_sum_df <- ref_cont_count_df %>%
  mutate(sig_cont = ifelse(
    substr(sig_cont, 2, 2) %in% c("C", "T"),
    sig_cont,
    stri_reverse(chartr("ACGT", "TGCA", sig_cont))
  )) %>%
  group_by(sig_cont) %>%
  dplyr::summarise(tot_ref_cont_count = sum(value))


tot_mut_count_ref_df <- left_join(kat_mut_count_ref_df, ref_cont_count_sum_df)
tot_mut_count_ref_df %>% filter(sig_cont == "ACA")

# Number of samples that contribute at least one retained clustered mutation.
id_count <- length(unique(A3A_cl_merge_f_df$id))

# Per channel: context counts outside the kataegis regions, total mutation
# counts over those samples, mutations outside kataegis, and the two rates
#   kat_rate = kataegis mutations / context positions inside the regions
#   tot_rate = other mutations / (context positions outside * id_count)
tot_mut_count_ref_count_df <- tot_mut_count_ref_df %>%
  mutate(kat_out_ref_cont_count = tot_ref_cont_count - kat_ref_cont_count) %>%
  left_join(
    merge_df %>%
      filter(id %in% unique(A3A_cl_merge_f_df$id)) %>%
      group_by(sig_cont) %>%
      dplyr::summarise(tot_mut_count = n()) %>%
      plyr::rename(c("sig_cont" = "mut_sig_cont"))
  ) %>%
  mutate(kat_out_mut_count = tot_mut_count - kat_mut_count) %>%
  mutate(
    kat_rate = kat_mut_count / kat_ref_cont_count,
    tot_rate = kat_out_mut_count / (kat_out_ref_cont_count * id_count)
  )

# Long format (one row per channel and region) for plotting.
tot_mut_count_ref_count_sum_df <- tot_mut_count_ref_count_df %>%
  select(mut_sig_cont, kat_rate, tot_rate) %>%
  gather(region, rate, kat_rate:tot_rate)
tot_mut_count_ref_count_sum_df$mut_sig_cont <- factor(
  tot_mut_count_ref_count_sum_df$mut_sig_cont,
  levels = contextorder96
)
tot_mut_count_ref_count_sum_df %>% spread(region, rate)

# Exploratory bar plot of both rates (scaled by 1000000000) for every channel
# that does not match "TC>".
tot_mut_count_ref_count_sum_df %>%
  filter(!grepl("TC>", mut_sig_cont)) %>%
  ggplot(aes(x = mut_sig_cont, y = rate * 1000000000, col = region)) +
  geom_bar(stat = "identity", position = "dodge")

# Fisher's exact test per substitution channel:
#   row 1: kataegis mutations, context positions inside the kataegis regions
#   row 2: other mutations, context positions outside * id_count
# NOTE(review): fisher.test() rejects entries larger than
# .Machine$integer.max; genome-wide context counts multiplied by id_count may
# get close to that - confirm the tables stay in range.
fisher_tmp <- lapply(tot_mut_count_ref_count_df$mut_sig_cont, function(x) {
  df <- tot_mut_count_ref_count_df %>% filter(mut_sig_cont == x)
  contingency <- rbind(
    c(df$kat_mut_count, df$kat_ref_cont_count),
    c(df$kat_out_mut_count, (df$kat_out_ref_cont_count) * id_count)
  )
  # tibble() replaces data.frame() %>% as.tibble(); as.tibble() is deprecated.
  tibble(pval = fisher.test(contingency)$p.value, sig_cont = x)
})

fisher_df <- do.call(rbind, fisher_tmp)
c_pval <- fisher_df$pval

# BH-adjusted p-values, labelled with the channel each test was run on. The
# labels are taken from fisher_df itself rather than from contextorder96, so
# they stay correct whatever the row order of tot_mut_count_ref_count_df is.
pval_df <- cbind(
  tibble(value = p.adjust(c_pval, method = "BH", n = length(c_pval))),
  sig_cont = as.character(fisher_df$sig_cont)
)

# ---- Inspect inputs and extend kat_count_sum_df for the per-cluster test ----
VCN_df %>%
  filter(!info %in% excl_info) %>%
  arrange(-tot_VCN)
kat_count_sum_df
prop.table(c(416331, 68824492689))
kat_count_sum_df %>%
  dplyr::summarise(
    tot_count_sum = sum(count_sum),
    out_count_sum = sum(out_count_sum)
  )

# Attach, per sample, the number of mutations whose sig_cont matches "[ACG]C>".
# The sample id is `info` without its last two "_"-separated fields. The
# previous pattern "_[0-9]*_[0-9]*$" only stripped purely numeric fields, so an
# `info` carrying a non-numeric chromosome (e.g. "X") kept its suffix and never
# matched an id in merge_df.
kat_count_sum_df <- left_join(
  kat_count_sum_df %>% mutate(id = gsub("_[^_]+_[^_]+$", "", info)),
  merge_df %>%
    mutate(VCN = ifelse(grepl("[ACG]C>", sig_cont), 1, 0)) %>%
    group_by(id) %>%
    dplyr::summarise(VCN_count = sum(VCN))
)
#fisher.test(rbind(c(416331,68824492689),c(4103,655471221)))
#?chisq.
# A bare `fisher.test()` call stood here; it always fails because the required
# argument `x` is missing, which aborts a non-interactive run of the script.
# Kept as a help lookup instead:
#?fisher.test
(kat_count_sum_df %>% filter(info %in% (VCN_df %>% filter(!info %in% excl_info))$info))$info
x <- "A3A_1st_C3_100ng-1_1_2"

# Attach, per cluster, the number of index_df mutations matching "[AGC]C>".
kat_count_sum_df <- left_join(
  kat_count_sum_df,
  index_df %>%
    mutate(VCN = ifelse(grepl("[AGC]C>", sig_cont), 1, 0)) %>%
    group_by(info) %>%
    dplyr::summarise(tot_VCN = sum(VCN))
)
kat_count_sum_df
# Fisher's exact test per cluster listed in VCN_df:
#   row 1: tot_VCN, count_sum
#   row 2: VCN_count - tot_VCN, out_count_sum
fisher_tmp <- lapply(
  (kat_count_sum_df %>% filter(info %in% (VCN_df$info)))$info,
  function(x) {
    df <- kat_count_sum_df %>% filter(info == x)
    # Alternatives kept from the original notes:
    # chisq.test(c(1,df$count_sum),simulate.p.value=TRUE,B=10000,p=prop.table(c(df$VCN_count,df$out_count_sum)))
    # fisher.test(rbind(c(1,df$count_sum),c(df$VCN_count,df$out_count_sum)))
    contingency <- rbind(
      c(df$tot_VCN, df$count_sum),
      c(df$VCN_count - df$tot_VCN, df$out_count_sum)
    )
    # tibble() replaces data.frame() %>% as.tibble(); as.tibble() is deprecated.
    tibble(pval = fisher.test(contingency)$p.value, info = x)
  }
)
fisher_df <- do.call(rbind, fisher_tmp)
chi_df
index_df %>% filter(info == "A3A_C3_TP53_C3_100ng-3_10_2")
fisher_df %>%
  filter(info %in% VCN_df$info) %>%
  arrange(-pval)

# Raw p-values of the tested clusters; adjusted further below.
c_pval <- (fisher_df %>%
  filter(info %in% VCN_df$info))$pval


log10(c_pval)
c_pval
library(ggforce)
c_pval %>% max()





# ---- Bar plot of BH-adjusted p-values binned by order of magnitude ----
# Bins: 1 = [0.01, Inf), 2 = [0.001, 0.01), ..., 8 = [0.000000001, 0.00000001).
# count_ref_df lists the eight bins so that empty bins still appear with n = 0.
# tibble() replaces data.frame()/vector %>% as.tibble(); as.tibble() is
# deprecated.
count_ref_df <- tibble(group = c(1:8))


# NOTE(review): adjusted p-values below 0.000000001 are assigned bin 9, which
# is not in count_ref_df, so they are dropped by the left join - confirm that
# this is intended.
p_count_df <- left_join(
  count_ref_df,
  tibble(value = p.adjust(c_pval, method = "BH", n = length(c_pval))) %>%
    mutate(group = ifelse(value >= 0.01, 1,
      ifelse(value >= 0.001, 2,
        ifelse(value >= 0.0001, 3,
          ifelse(value >= 0.00001, 4,
            ifelse(value >= 0.000001, 5,
              ifelse(value >= 0.0000001, 6,
                ifelse(value >= 0.00000001, 7,
                  ifelse(value >= 0.000000001, 8,
                    9
                  )
                )
              )
            )
          )
        )
      )
    )) %>%
    group_by(group) %>%
    dplyr::summarise(n = n())
)


p_count_df[is.na(p_count_df)] <- 0
p_count_df$group <- factor(p_count_df$group, levels = c(1:8))

# The outline colour is set as a constant on the layer. Previously it was given
# inside aes(col = "black"), which maps a one-level variable to the default
# colour scale, so the outline was drawn in the palette's first hue rather
# than in black.
p_mix_bar <- p_count_df %>%
  ggplot(aes(x = group, y = n)) +
  geom_bar(stat = "identity", colour = "black") +
  theme_classic()
p_mix_bar
ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/p_dist_mix_kat.bar.pdf", p_mix_bar,
  height = 8, width = 10
)


# Histogram of the BH-adjusted p-values on a reversed log10 x axis
# (trans_reverser() comes from ggforce), restricted to 0.05 ... 0.0000001.
# `limits` is now spelled out (it was passed as the partial name `lim`), and
# tibble() replaces the deprecated as.tibble().
p_mix <- tibble(value = p.adjust(c_pval, method = "BH", n = length(c_pval))) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  scale_x_continuous(
    trans = trans_reverser('log10'),
    breaks = c(0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.05),
    limits = c(0.05, 0.0000001)
  ) +
  theme_bw()
p_mix
ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/p_dist_VCN_kat.pdf", p_mix,
  height = 8, width = 10
)

#######mixed-kataegis######
###########################

# Cluster keys ("<id>_<#CHROM>_<cluster_id>") selected for the mixed-kataegis
# section.
mix_c <- c(
  "A3A_C3_TP53_C3_3ug-3_1_5",
  "A3A_C3_3ug-5_5_2",
  "A3A_1st_C3_3ug-2_2_7",
  "A3A_1st_C3_3ug-2_2_17",
  "A3A_C3_3ug-5_22_5"
)

mix_c

