library(dplyr)
library(tidyverse)
library(ggplot2)
library(MASS)
#KO
##A3A mediated cluster event##

# ---- Load inputs and build the merged per-sample variant table ----

# Per-sample metadata; joined onto merge_df further down.
metadata <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/sig/metadata.txt")

# Clustered-mutation annotation table.
cl_df <- read_tsv("/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation/APOBEC_clustered_mutations/annotated/vaf_considered/APOBEC_clustered_mutations.all_samples.annotated.tsv")

# Inspection only: distinct (samples, imd_cutoff) pairs, lowest cutoff first.
cl_df %>%
  dplyr::select(samples, imd_cutoff) %>%
  unique() %>%
  arrange(imd_cutoff)

# All files in the new_filter directory whose name matches ".excl.*gd$".
files_to_read <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter",
  ".excl.*gd$",
  full.names = TRUE
)

# Inspection only: column names of the first variant table.
t_df <- read_tsv(files_to_read[1])
t_df %>% colnames()

# Read every file, derive the sample id from the file name (everything from
# ".mutect2" onwards is stripped) and keep only the columns used downstream.
vcf_tmp <- lapply(files_to_read, function(x) {
  read_tsv(x) %>%
    mutate(id = gsub(".mutect2.*", "", basename(x))) %>%
    dplyr::select(id, `#CHROM`, POS, REF, ALT, sig_cont, Func_refGene, Gene_refGene)
})

# Inspect the first table only now that vcf_tmp actually exists.
vcf_tmp[[1]]

# Stack all samples, then keep rows whose sig_cont contains a
# "<two bases>><two bases>" pattern.
merge_df <- do.call(rbind, vcf_tmp)
merge_df <- merge_df %>%
  filter(grepl("[ATGC][ATGC]>[ATGC][ATGC]", sig_cont))

# Attach per-sample metadata (without its `m/d` column). No `by` is given, so
# the join keys are whatever column names the two tables share.
# select() is namespace-qualified because this script attaches MASS after
# dplyr, and MASS also exports a select().
merge_df <- left_join(merge_df, metadata %>% dplyr::select(-`m/d`))

# Inspection only: the cluster-annotation columns that get joined below.
cl_df %>%
  dplyr::select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3)

# Attach cluster annotation; the cl_df key columns are renamed to the names
# used in merge_df so the implicit join can match on them.
merge_df <- left_join(
  merge_df,
  cl_df %>%
    dplyr::select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3) %>%
    plyr::rename(c("chr" = "#CHROM", "start" = "POS", "samples" = "id"))
)

# Rows without a cluster match get IMD 0, cluster_id "." and cluster
# "non-clust"; the original cluster-type column is then dropped.
merge_df <- merge_df %>%
  mutate(
    IMD = ifelse(is.na(IMD), 0, IMD),
    cluster_id = ifelse(is.na(cluster_id), ".", cluster_id),
    cluster = ifelse(is.na(cluster_type_omikli_upto_3), "non-clust", cluster_type_omikli_upto_3)
  ) %>%
  dplyr::select(-cluster_type_omikli_upto_3)

# Inspection only: dose labels present in the merged table.
unique(merge_df$dose)

# A3A rows at the two doses of interest.
A3A_cl_merge_df <- merge_df %>%
  filter(dose %in% c("3ug", "100ng"), APOBEC == "A3A")

A3A_cl_merge_df <- mutate(A3A_cl_merge_df, APOBEC = "A3A")

# AMS = per-cluster number of rows whose sig_cont matches "TC>[GT][AT]".
# The summary excludes "non-clust" rows, so after the join those rows have
# AMS = NA and are removed together with clusters that have AMS < 2.
A3A_cl_merge_f_df <- A3A_cl_merge_df %>%
  left_join(
    A3A_cl_merge_df %>%
      group_by(id, `#CHROM`, APOBEC, dose, TP53, cluster_id, cluster) %>%
      dplyr::summarise(AMS = sum(grepl("TC>[GT][AT]", sig_cont))) %>%
      filter(cluster != "non-clust")
  ) %>%
  filter(AMS >= 2)

# no_snv = number of retained rows per (id, chromosome, cluster).
A3A_cl_merge_f_df <- A3A_cl_merge_f_df %>%
  left_join(
    A3A_cl_merge_f_df %>%
      group_by(id, `#CHROM`, cluster_id, cluster) %>%
      dplyr::summarise(no_snv = n())
  )

# Three exclusion lists; each is used below through its `info` column.
excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/exclude_cl_mut_df.v2.txt"
)
overlap_excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/overlapped_cluster.v2.txt"
)
miss_phs_excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/miss_phased_cluster.txt"
)

# Build the key "<id>_<#CHROM>_<cluster_id>" that is compared against the
# `info` columns of the exclusion lists.
A3A_cl_merge_f_df <- mutate(
  A3A_cl_merge_f_df,
  info = paste(id, `#CHROM`, cluster_id, sep = "_")
)

# Inspection only: exclusion entries whose id contains "A3A".
filter(excl_df, grepl("A3A", id))

# Drop every cluster that appears in any of the three lists.
A3A_cl_merge_f_excl_df <- A3A_cl_merge_f_df %>%
  filter(
    !info %in% excl_df$info,
    !info %in% overlap_excl_df$info,
    !info %in% miss_phs_excl_df$info
  )
A3A_cl_merge_f_excl_df
#####


# ---- Rebuild merge_df without the dinucleotide filter ----

# Make the dplyr verbs win over same-named functions from other packages.
conflicted::conflicts_prefer(dplyr::filter)
conflicted::conflicts_prefer(dplyr::select)

# Epigenomic annotation table; joined onto merge_df further down.
epi_annot_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/05_vcf/annotation_total/KO/bin5_tot//APOBEC_epi_merge.A3A.spread.tsv")

# Same metadata and cluster-annotation files as read at the top of the script;
# re-read here so this section can be run on its own.
metadata <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/sig/metadata.txt")
cl_df <- read_tsv("/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation/APOBEC_clustered_mutations/annotated/vaf_considered/APOBEC_clustered_mutations.all_samples.annotated.tsv")

# Inspection only: distinct (samples, imd_cutoff) pairs, lowest cutoff first.
cl_df %>%
  dplyr::select(samples, imd_cutoff) %>%
  unique() %>%
  arrange(imd_cutoff)

files_to_read <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter",
  ".excl.*gd$",
  full.names = TRUE
)

# Inspection only: column names of the first variant table.
t_df <- read_tsv(files_to_read[1])
t_df %>% colnames()

# Read every file, derive the sample id from the file name and keep the
# columns used downstream.
vcf_tmp <- lapply(files_to_read, function(x) {
  read_tsv(x) %>%
    mutate(id = gsub(".mutect2.*", "", basename(x))) %>%
    dplyr::select(id, `#CHROM`, POS, REF, ALT, sig_cont, Func_refGene, Gene_refGene)
})

# Inspect the freshly built list (not a leftover from an earlier section).
vcf_tmp[[1]]

merge_df <- do.call(rbind, vcf_tmp)
# Unlike the first pass, the dinucleotide sig_cont filter is intentionally off:
#merge_df<-merge_df%>%filter(grepl("[ATGC][ATGC]>[ATGC][ATGC]",sig_cont))

# Attach per-sample metadata (without `m/d`); the join keys are the shared
# column names. select() is namespace-qualified throughout because MASS, which
# also exports select(), is attached after dplyr in this script.
merge_df <- left_join(merge_df, metadata %>% dplyr::select(-`m/d`))

# Inspection only: the cluster-annotation columns that get joined below.
cl_df %>%
  dplyr::select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3)

# Attach cluster annotation after renaming the cl_df keys to merge_df's names.
merge_df <- left_join(
  merge_df,
  cl_df %>%
    dplyr::select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3) %>%
    plyr::rename(c("chr" = "#CHROM", "start" = "POS", "samples" = "id"))
)

# Unmatched rows become IMD 0 / cluster_id "." / cluster "non-clust".
merge_df <- merge_df %>%
  mutate(
    IMD = ifelse(is.na(IMD), 0, IMD),
    cluster_id = ifelse(is.na(cluster_id), ".", cluster_id),
    cluster = ifelse(is.na(cluster_type_omikli_upto_3), "non-clust", cluster_type_omikli_upto_3)
  ) %>%
  dplyr::select(-cluster_type_omikli_upto_3)

# Inspection only. NOTE(review): this peeks at r1 and H3K36me3, while the join
# below uses r2, H3K9me3 and H3K27me3 - confirm which columns are intended.
epi_annot_df %>%
  dplyr::select(`#CHROM`, r1, id, RepliSeq, RNAseq, H3K36me3)

# Attach the per-variant feature bins; a variant matches an annotation row
# when id and chromosome agree and POS equals the annotation's r2.
merge_df <- left_join(
  merge_df %>%
    dplyr::select(id, `#CHROM`, POS, REF, ALT, sig_cont, cluster_id, cluster),
  epi_annot_df %>%
    dplyr::select(`#CHROM`, r2, id, RepliSeq, RNAseq, H3K9me3, H3K27me3),
  by = c("id" = "id", "#CHROM" = "#CHROM", "POS" = "r2")
)
# Reference table of feature-bin combinations and their sizes. Column names are
# supplied explicitly and every column is read as character.
# NOTE(review): with col_names given, the first line of the file is read as
# data - confirm the file has no header row.
ref_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/06_ref/final/combination/bin5/RR927/KO/context/RR927.q5.q5.q5.q5.size.tsv",
  col_names = c("RepliSeq", "RNAseq", "H3K9me3", "H3K27me3", "size"),
  col_types = cols(.default = "c", RepliSeq = "c", RNAseq = "c", size = "c", H3K9me3 = "c", H3K27me3 = "c")
)
#ref_df<-ref_df%>%group_by(RepliSeq,RNAseq)%>%
#  dplyr::summarise(tot_size=sum(size))%>%
#  plyr::rename(c("tot_size"="size"))
bin6_ref_df <- ref_df

# Inspection only: total size. `size` is stored as character, so it has to be
# converted before summing (sum() does not accept character input).
sum(as.double(bin6_ref_df$size))

# Bin labels are kept as character so that they are modelled as categorical.
ref_df$RepliSeq <- as.character(ref_df$RepliSeq)
ref_df$RNAseq <- as.character(ref_df$RNAseq)

# Keep samples whose id contains both "A3A" and "TP53", is not a "Ctrl"
# sample, and is not one of the two explicitly excluded ids.
merge_df <- merge_df %>%
  filter(
    grepl("A3A", id),
    grepl("TP53", id),
    !grepl("Ctrl", id),
    !id %in% c("A3A_1st_C3", "A3A_C3_TP53KO_C3")
  )


# Inspection only: the sample ids that remain.
unique(merge_df$id)
# Count distinct TC>T / TC>G variants per combination of the four feature
# bins and attach the counts to the reference table (one row per combination).
# NOTE(review): ref_df holds the bins as character; this join assumes the bin
# columns coming from epi_annot_df have a compatible type - confirm.
tot_epi_fin_df <- left_join(
  ref_df,
  merge_df %>%
    filter(
      grepl("C>G", sig_cont) | grepl("C>T", sig_cont),
      grepl("TC>[TG]", sig_cont)
    ) %>%
    dplyr::select(`#CHROM`, POS, cluster, RepliSeq, RNAseq, H3K9me3, H3K27me3, sig_cont) %>%
    unique() %>%
    group_by(RepliSeq, RNAseq, H3K9me3, H3K27me3) %>%
    dplyr::summarise(n = n()) %>%
    ungroup()
)

# Combinations without any variant come back as NA from the join; set to 0.
tot_epi_fin_df[is.na(tot_epi_fin_df)] <- 0

# Rename n -> count, make size numeric and derive sample_size = 6 * size
# (the model below divides by 6 again inside its offset).
tot_epi_fin_df <- tot_epi_fin_df %>%
  mutate(n = ifelse(is.na(n), 0, n)) %>%
  plyr::rename(c("n" = "count")) %>%
  mutate(size = as.double(size), sample_size = 6 * size)
#tot_epi_fin_df$RepliSeq<-as.character(tot_epi_fin_df$RepliSeq)
#tot_epi_fin_df$RNAseq<-as.character(tot_epi_fin_df$RNAseq)
#tot_epi_fin_df$H3K36me3<-as.character(tot_epi_fin_df$H3K36me3)

tot_epi_fin_df
library(MASS)
#conflicted::conflict_prefer(`dplyr::select`)

# Negative-binomial regression of the variant count on the four categorical
# feature bins; the offset reduces to log(size).
tot_fit <- glm.nb(
  count ~ RepliSeq + RNAseq + H3K9me3 + H3K27me3 + offset(log(sample_size / 6)),
  data = tot_epi_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3", "RepliSeq4", "RepliSeq5",
  "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4", "RNAseq5",
  "H3K9me31", "H3K9me32", "H3K9me33", "H3K9me34", "H3K9me35",
  "H3K27me31", "H3K27me32", "H3K27me33", "H3K27me34", "H3K27me35"
)

# One row per coefficient (intercept dropped) plus explicit rows with
# coefficient 0 for the reference level ("0") of each feature.
t_tot_df <- tibble(bin_name = names(tot_fit$coefficients), coef = tot_fit$coefficients) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef))) %>%
  rbind(data.frame(
    bin_name = c("RepliSeq0", "RNAseq0", "H3K9me30", "H3K27me30"),
    coef = c(0, 0, 0, 0),
    log2_coef = c(0, 0, 0, 0)
  ))

new_bin6_RR3D_tot_event_df <- t_tot_df %>% mutate(fold = exp(coef))
new_bin6_RR3D_tot_event_df
#new_bin6_RR3D_tot_event_df%>%filter(grepl("2",bin_name))$fold

# x-axis order for the plot (RepliSeq is shown from bin 4 down to bin 0).
new_bin_order <- c(
  "RepliSeq4", "RepliSeq3", "RepliSeq2", "RepliSeq1", "RepliSeq0",
  #"DHS0","DHS1","DHS2","DHS3",
  #"H3K36me30","H3K36me31","H3K36me32","H3K36me33",
  "RNAseq0", "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4",
  "H3K9me30", "H3K9me31", "H3K9me32", "H3K9me33", "H3K9me34",
  "H3K27me30", "H3K27me31", "H3K27me32", "H3K27me33", "H3K27me34"
)

# The feature each row belongs to is its bin_name without the final bin digit
# (bins are single digits here). Deriving the label from the name, instead of
# assigning labels by row position after sorting, keeps it correct whatever
# collation arrange() uses and however many bins a feature has.
new_bin6_RR3D_tot_event_df <- new_bin6_RR3D_tot_event_df %>%
  arrange(bin_name) %>%
  mutate(bin_group = sub("[0-9]$", "", bin_name))

# Re-express every fold relative to bin 2 of the same feature.
new_bin6_RR3D_tot_event_df <- left_join(
  new_bin6_RR3D_tot_event_df,
  new_bin6_RR3D_tot_event_df %>%
    filter(grepl("2$", bin_name)) %>%
    plyr::rename(c("fold" = "base_fold")) %>%
    dplyr::select(bin_group, base_fold)
) %>%
  mutate(new_fold = fold / base_fold)
# Fold change (relative to bin 2) per feature bin, one facet per feature.
# Values outside the 0-1.5 y-limits are not drawn.
p <- ggplot(
  new_bin6_RR3D_tot_event_df,
  #filter(group%in%c("non-clust","kataegis","omikli"))%>%
  aes(x = factor(bin_name, levels = new_bin_order), y = new_fold, group = bin_group)
) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  #ggtitle("clustered Multivariate analysis")+
  labs(x = "", y = "Fold change  in A3A mutations") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 20),
    strip.text.x = element_text(size = 30),
    axis.title = element_text(size = 20),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 40),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.length = unit(.4, "cm")
  ) +
  geom_hline(yintercept = 1, color = "black", linetype = "dashed") +
  scale_y_continuous(
    breaks = seq(0, 1.5, by = 0.5),
    limits = c(0, 1.5)
  ) +
  facet_wrap(~bin_group, scales = "free_x", ncol = 4) +
  #ggtitle("WT_own_bin4")+
  scale_color_manual(values = c("RED", "black"))
#ggtitle("WT_ori_bin4")+
#ggtitle(nm)
p



# Write the enrichment plot (PDF) and the underlying table (tab-separated).
ggsave(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/bin5/KO_merged.enrich.pdf",
  plot = p,
  width = 10,
  height = 5
)

write.table(
  new_bin6_RR3D_tot_event_df,
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/bin5//KO_merged.enrich.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
# Label the four-feature count table once; both output files below inherit
# these labels. select() is namespace-qualified because MASS, which also
# exports select(), is attached after dplyr in this script.
tot_epi_fin_cont_df <- tot_epi_fin_df %>%
  mutate(
    TP53 = "KO",
    bin_type = "bin5",
    ref = "own",
    sample = "organoid",
    region = "total"
  ) %>%
  dplyr::select(-sample_size)
#mutate(id=nm)

# Per-combination counts.
tot_epi_fin_cont_df %>%
  dplyr::select(sample, ref, bin_type, TP53, RepliSeq, RNAseq, H3K9me3, H3K27me3, count, region) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/bin5/KO_merged.merged.count.tsv",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# Marginal rate per feature bin: reshape to one row per (feature, bin), then
# total the sizes and counts. The table keeps the TP53 = "KO" / region =
# "total" labels set above; they were previously overwritten here with
# "WT" / "excl_intergenic", which contradicted the count file written from
# the same data and the KO_merged file name.
# NOTE(review): the meaning of the 6 * 4 scaling of tot_size is not evident
# from this script - confirm.
tot_epi_fin_cont_df %>%
  gather(seq, bin, RepliSeq:H3K27me3) %>%
  #  select(-sample_size)%>%
  group_by(ref, bin_type, TP53, seq, bin, region) %>%
  dplyr::summarise(
    tot_size = sum(size) * 6 * 4,
    tot_count = sum(count)
  ) %>%
  mutate(rate = tot_count / tot_size) %>%
  #mutate(id=nm)%>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/bin5/KO_merged.merged.rate.tsv",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# Inspection only: the sample ids that enter the two-feature analysis.
merge_df %>%
  #filter(id==nm)%>%
  #filter(grepl("_3",id)|id=="A3A_1st_C3_100ng-1")%>%
  filter(
    grepl("TP53", id),
    !grepl("Ctrl", id),
    grepl("A3A", id),
    id != "A3A_C3_TP53KO_C3"
  ) %>%
  dplyr::select(id) %>%
  unique()

##merge##
# Count distinct TC>T / TC>G variants per RepliSeq x RNAseq combination.
# The reference table is first collapsed to the same two features (sizes
# summed). ref_df carries H3K9me3 and H3K27me3 columns as well, so joining the
# two-feature counts onto it un-collapsed would attach the same count to every
# row sharing a RepliSeq/RNAseq pair and overstate the counts in the model.
# NOTE(review): assumes ref_df has one row per four-feature combination -
# confirm.
tot_epi_fin_df <- left_join(
  ref_df %>%
    mutate(size = as.double(size)) %>%
    group_by(RepliSeq, RNAseq) %>%
    dplyr::summarise(size = sum(size)) %>%
    ungroup(),
  merge_df %>%
    #filter(id==nm)%>%
    #filter(grepl("_3",id)|id=="A3A_1st_C3_100ng-1")%>%
    filter(
      grepl("TP53", id),
      !grepl("Ctrl", id),
      grepl("A3A", id),
      id != "A3A_C3_TP53KO_C3",
      grepl("C>G", sig_cont) | grepl("C>T", sig_cont),
      grepl("TC>[TG]", sig_cont)
    ) %>%
    dplyr::select(`#CHROM`, POS, cluster, RepliSeq, RNAseq, sig_cont) %>%
    unique() %>%
    group_by(RepliSeq, RNAseq) %>%
    dplyr::summarise(n = n()) %>%
    ungroup()
)

# Combinations without any variant come back as NA from the join; set to 0.
tot_epi_fin_df[is.na(tot_epi_fin_df)] <- 0

# Rename n -> count and derive sample_size = 5 * size (the model below
# divides by 5 again inside its offset).
tot_epi_fin_df <- tot_epi_fin_df %>%
  mutate(n = ifelse(is.na(n), 0, n)) %>%
  plyr::rename(c("n" = "count")) %>%
  mutate(size = as.double(size), sample_size = 5 * size)
#tot_epi_fin_df$RepliSeq<-as.character(tot_epi_fin_df$RepliSeq)
tot_epi_fin_df$RNAseq <- as.character(tot_epi_fin_df$RNAseq)
#tot_epi_fin_df$H3K36me3<-as.character(tot_epi_fin_df$H3K36me3)


tot_epi_fin_df
library(MASS)
#conflicted::conflict_prefer(`dplyr::select`)

# Negative-binomial regression of the variant count on the RepliSeq and RNAseq
# bins; the offset reduces to log(size).
tot_fit <- glm.nb(
  count ~ RepliSeq + RNAseq + offset(log(sample_size / 5)),
  data = tot_epi_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3", "RepliSeq4",
  "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4", "RNAseq5"
)

# One row per coefficient (intercept dropped) plus explicit rows with
# coefficient 0 for the reference level ("0") of each feature.
t_tot_df <- tibble(bin_name = names(tot_fit$coefficients), coef = tot_fit$coefficients) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef))) %>%
  rbind(data.frame(
    bin_name = c("RepliSeq0", "RNAseq0"),
    coef = c(0, 0),
    log2_coef = c(0, 0)
  ))

new_bin6_RR3D_tot_event_df <- t_tot_df %>% mutate(fold = exp(coef))
new_bin6_RR3D_tot_event_df
#new_bin6_RR3D_tot_event_df%>%filter(grepl("2",bin_name))$fold

new_bin_order <- c(
  "RepliSeq4", "RepliSeq3", "RepliSeq2", "RepliSeq1", "RepliSeq0",
  #"DHS0","DHS1","DHS2","DHS3",
  #"H3K36me30","H3K36me31","H3K36me32","H3K36me33",
  "RNAseq0", "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4", "RNAseq5"
)

# Take the feature label from bin_name (name minus its final bin digit).
# Assigning c("RepliSeq", "RNAseq") by row position after arrange() only works
# if "RepliSeq..." sorts before "RNAseq..."; in C-locale ordering (the
# arrange() default in current dplyr) "RNAseq..." comes first, which would
# swap the two labels.
new_bin6_RR3D_tot_event_df <- new_bin6_RR3D_tot_event_df %>%
  arrange(bin_name) %>%
  mutate(bin_group = sub("[0-9]$", "", bin_name))

# Re-express every fold relative to bin 2 of the same feature; the pattern is
# anchored so that only names ending in "2" qualify as baseline.
new_bin6_RR3D_tot_event_df <- left_join(
  new_bin6_RR3D_tot_event_df,
  new_bin6_RR3D_tot_event_df %>%
    filter(grepl("2$", bin_name)) %>%
    plyr::rename(c("fold" = "base_fold")) %>%
    dplyr::select(bin_group, base_fold)
) %>%
  mutate(new_fold = fold / base_fold)
# Label used in the plot title and in the output file names below.
nm <- "KO_merge"

# Fold change (relative to bin 2) per feature bin, one facet per feature.
# Values outside the 0-2 y-limits are not drawn.
p <- ggplot(
  new_bin6_RR3D_tot_event_df,
  #filter(group%in%c("non-clust","kataegis","omikli"))%>%
  aes(x = factor(bin_name, levels = new_bin_order), y = new_fold, group = bin_group)
) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  #ggtitle("clustered Multivariate analysis")+
  labs(x = "", y = "Fold change  in A3A mutations") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 20),
    strip.text.x = element_text(size = 30),
    axis.title = element_text(size = 20),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 40),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.length = unit(.4, "cm")
  ) +
  geom_hline(yintercept = 1, color = "black", linetype = "dashed") +
  scale_y_continuous(
    breaks = seq(0, 2, by = 0.5),
    limits = c(0, 2)
  ) +
  facet_wrap(~bin_group, scales = "free_x", ncol = 4) +
  #ggtitle("WT_own_bin4")+
  scale_color_manual(values = c("RED", "black")) +
  #ggtitle("WT_ori_bin4")+
  ggtitle(nm)
p

ggsave(
  paste0("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/rep_bin5_rna_bin5/", nm, ".pdf"),
  plot = p,
  width = 10,
  height = 5
)

# Enrichment table behind the two-feature plot.
write.table(
  new_bin6_RR3D_tot_event_df,
  paste0("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/rep_bin5_rna_bin5/", nm, ".enrich.txt"),
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)

# Label the count table once; both files below inherit these labels.
# select() is namespace-qualified because MASS, which also exports select(),
# is attached after dplyr in this script.
# NOTE(review): bin_type is "bin6" although the output directory is
# rep_bin5_rna_bin5 - confirm which label is intended.
tot_epi_fin_cont_df <- tot_epi_fin_df %>%
  mutate(TP53 = "KO", bin_type = "bin6", ref = "own", sample = "organoid") %>%
  dplyr::select(-sample_size) %>%
  mutate(id = nm)

# Per-combination counts.
tot_epi_fin_cont_df %>%
  dplyr::select(sample, ref, bin_type, TP53, RepliSeq, RNAseq, count) %>%
  write.table(
    paste0("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/rep_bin5_rna_bin5/", nm, ".tsv"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# Marginal rate per feature bin: one row per (feature, bin) with summed size
# and count. The labels set above are reused as grouping columns.
tot_epi_fin_cont_df %>%
  gather(seq, bin, RepliSeq:RNAseq) %>%
  #  select(-sample_size)%>%
  group_by(sample, ref, bin_type, TP53, seq, bin) %>%
  dplyr::summarise(
    tot_size = sum(size),
    tot_count = sum(count)
  ) %>%
  mutate(rate = tot_count / tot_size) %>%
  mutate(id = nm) %>%
  write.table(
    paste0("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/rep_bin5_rna_bin5/", nm, ".merge.tsv"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

####



##non-clust###
# Non-clustered variants only: builds a TC>T count table and a TC>G count
# table per RepliSeq x RNAseq combination and fits one negative-binomial
# model to each.
# NOTE(review): both pipelines filter on id==nm. At this point nm is
# "KO_merge", while merge_df was restricted above to ids containing "A3A", so
# the filter keeps no rows and every count becomes 0. This looks like the body
# of a per-sample loop whose header is no longer in the file - confirm the
# intended value(s) of nm before relying on these results.
# NOTE(review): the counts are grouped by RepliSeq and RNAseq only, but ref_df
# also carries H3K9me3 and H3K27me3, so each count is attached to every ref_df
# row that shares the RepliSeq/RNAseq pair - confirm this is intended.

# TC>T count table.
tot_epi_fin_df<-left_join(ref_df,merge_df%>%
                            filter(id==nm)%>%
                            filter(cluster=="non-clust")%>%
                            filter(grepl("C>G",sig_cont)|grepl("C>T",sig_cont))%>%
                            filter(grepl("TC>T",sig_cont))%>%
                            dplyr::select(`#CHROM`,POS,cluster,RepliSeq,RNAseq,sig_cont)%>%
                            unique()%>%
                            group_by(RepliSeq,RNAseq)%>%
                            dplyr::summarise(n=n())%>%ungroup()
)
#tot_epi_fin_df
# Combinations without a match come back as NA from the join; set to 0.
tot_epi_fin_df[is.na(tot_epi_fin_df)]<-0



tot_epi_fin_df<-tot_epi_fin_df%>%mutate(n=ifelse(is.na(n),0,n))

tot_epi_fin_df<-tot_epi_fin_df%>%plyr::rename(c("n"="count"))

# sample_size is 5 * size; the model offset divides by 5 again.
tot_epi_fin_df<-tot_epi_fin_df%>%mutate(size=as.double(size))%>%mutate(sample_size=5*size)
tot_epi_fin_df$RepliSeq<-as.character(tot_epi_fin_df$RepliSeq)
tot_epi_fin_df$RNAseq<-as.character(tot_epi_fin_df$RNAseq)



#library(MASS)
#conflicted::conflict_prefer(`dplyr::select`)
# Negative-binomial fit for TC>T; the offset reduces to log(size).
tot_fit <- glm.nb(count ~ RepliSeq  + RNAseq + offset(log(sample_size/5)), data=tot_epi_fin_df)
bin_order = c("RepliSeq1","RepliSeq2","RepliSeq3",
              #"H3K36me31","H3K36me32","H3K36me33",
              "RNAseq1","RNAseq2","RNAseq3","RNAseq4","RNAseq5"
              #"DHS1","DHS2","DHS3"
)


# Coefficient table (intercept dropped) plus rows with coefficient 0 for the
# reference level of each feature.
t_tot_df<-tibble(bin_name = names(tot_fit $coefficients), coef = tot_fit $coefficients) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef)))%>%
  rbind(data.frame(bin_name=c("RepliSeq0","RNAseq0"),coef=c(0,0),log2_coef=c(0,0)))



# TC>G count table, built the same way as the TC>T table above.
tot_G_epi_fin_df<-left_join(ref_df,merge_df%>%
                              #filter(grepl("_3",id))%>%
                              filter(id==nm)%>%
                              filter(cluster=="non-clust")%>%
                              filter(grepl("C>G",sig_cont)|grepl("C>T",sig_cont))%>%
                              filter(grepl("TC>G",sig_cont))%>%
                              dplyr::select(`#CHROM`,POS,cluster,RepliSeq,RNAseq,sig_cont)%>%
                              unique()%>%
                              group_by(RepliSeq,RNAseq)%>%
                              dplyr::summarise(n=n())%>%ungroup()
)
#tot_epi_fin_df
tot_G_epi_fin_df[is.na(tot_G_epi_fin_df)]<-0



tot_G_epi_fin_df<-tot_G_epi_fin_df%>%mutate(n=ifelse(is.na(n),0,n))

tot_G_epi_fin_df<-tot_G_epi_fin_df%>%plyr::rename(c("n"="count"))

tot_G_epi_fin_df<-tot_G_epi_fin_df%>%mutate(size=as.double(size))%>%mutate(sample_size=5*size)
tot_G_epi_fin_df$RepliSeq<-as.character(tot_G_epi_fin_df$RepliSeq)
tot_G_epi_fin_df$RNAseq<-as.character(tot_G_epi_fin_df$RNAseq)



#library(MASS)
#conflicted::conflict_prefer(`dplyr::select`)
# Negative-binomial fit for TC>G.
tot_G_fit <- glm.nb(count ~ RepliSeq + RNAseq +offset(log(sample_size/5)), data=tot_G_epi_fin_df)


# Coefficient table for TC>G, with the same reference-level rows appended.
t_tot_G_df<-tibble(bin_name = names(tot_G_fit $coefficients), coef = tot_G_fit $coefficients) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef))) %>%
  rbind(data.frame(bin_name=c("RepliSeq0","RNAseq0"),coef=c(0,0),log2_coef=c(0,0)))

# Stack the TC>T and TC>G coefficient tables, tagging each with its group.
tot_event_df <- rbind(
  #omi_event_t_df%>%mutate(group="omikli"),
  #kat_event_t_df%>%mutate(group="kataegis"),
  #t_df%>%mutate(group="non-clust"),
  t_tot_df %>% mutate(group = "CtoT"),
  t_tot_G_df %>% mutate(group = "CtoG")
)
#tot_event_df%>%print(n=100)

# The feature each row belongs to is its bin_name without the final bin digit.
# Deriving it from the name replaces a hard-coded "4 RepliSeq rows, then 6
# RNAseq rows" layout, which is only right for one specific number of bins per
# feature and one specific sort order.
tot_event_df <- tot_event_df %>%
  arrange(group, bin_name) %>%
  mutate(bin_group = sub("[0-9]$", "", bin_name))
#tot_event_df<-tot_event_df%>%mutate(bin_group=rep(rep(c("RepliSeq","H3K36me3","RNAseq"),each=3),5))
tot_event_df$bin_group <- factor(tot_event_df$bin_group, levels = c("RepliSeq", "RNAseq"))
tot_event_df$group <- factor(tot_event_df$group, levels = c("CtoT", "CtoG"))

# Reference-level rows, one per (group, feature). `each = 2` matches the two
# reference bins; a larger value silently recycles the other columns into
# duplicated rows.
zero_event_df <- data.frame(
  bin_name = rep(c("RepliSeq0", "RNAseq0"), 2),
  coef = c(0),
  log2_coef = c(0),
  group = rep(c("CtoT", "CtoG"), each = 2),
  bin_group = rep(c("RepliSeq", "RNAseq"), 2)
) %>% as_tibble()
zero_event_df

# t_tot_df and t_tot_G_df already contain the "RepliSeq0"/"RNAseq0" rows, so
# zero_event_df is not appended again; doing so would duplicate every
# reference row in the plot data and in the exported table.
new_bin6_RR3D_tot_event_df <- tot_event_df %>% mutate(fold = exp(coef))
new_bin6_RR3D_tot_event_df

# x-axis order for the plot; levels that do not occur are simply unused.
new_bin_order <- c(
  "RepliSeq0", "RepliSeq1", "RepliSeq2", "RepliSeq3",
  #"DHS0","DHS1","DHS2","DHS3",
  #"H3K36me30","H3K36me31","H3K36me32","H3K36me33",
  "RNAseq0", "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4", "RNAseq5"
)

# Fold per feature bin for the non-clustered variants, coloured by mutation
# group (CtoT / CtoG), one facet per feature. Values outside the 0-2 y-limits
# are not drawn.
p <- ggplot(
  new_bin6_RR3D_tot_event_df,
  #filter(group%in%c("non-clust","kataegis","omikli"))%>%
  aes(x = factor(bin_name, levels = new_bin_order), y = fold, col = group, group = group)
) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  #ggtitle("clustered Multivariate analysis")+
  labs(x = "", y = "Fold change  in A3A mutations") +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 20),
    strip.text.x = element_text(size = 30),
    axis.title = element_text(size = 20),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 40),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.length = unit(.4, "cm")
  ) +
  geom_hline(yintercept = 1, color = "black", linetype = "dashed") +
  scale_y_continuous(
    breaks = seq(0, 2, by = 0.5),
    limits = c(0, 2)
  ) +
  facet_wrap(~bin_group, scales = "free_x", ncol = 4) +
  #ggtitle("WT_own_bin4")+
  scale_color_manual(values = c("RED", "black")) +
  #ggtitle("WT_ori_bin4")+
  ggtitle(paste0(nm, "_non_clust"))
p

# Save the plot and the table it was drawn from.
ggsave(
  paste0("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/RR/", nm, ".non_clust.pdf"),
  plot = p,
  width = 10,
  height = 5
)
write.table(
  new_bin6_RR3D_tot_event_df,
  paste0("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/RR/", nm, ".enrich.non_clust.txt"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# Stack the TC>T and TC>G count tables and label them; both files below
# inherit these labels. select() is namespace-qualified because MASS, which
# also exports select(), is attached after dplyr in this script.
# NOTE(review): TP53 is labelled "WT" and bin_type "bin6" here, while the
# surrounding analysis is restricted to TP53 samples and nm is "KO_merge" -
# confirm the intended labels.
tot_epi_fin_cont_df <- rbind(
  tot_epi_fin_df %>% mutate(mut_type = "C>T"),
  tot_G_epi_fin_df %>% mutate(mut_type = "C>G")
) %>%
  mutate(TP53 = "WT", bin_type = "bin6", ref = "own", sample = "organoid") %>%
  dplyr::select(-sample_size) %>%
  mutate(id = nm)

# Per-combination counts.
tot_epi_fin_cont_df %>%
  dplyr::select(sample, ref, bin_type, TP53, mut_type, RepliSeq, RNAseq, count) %>%
  write.table(
    paste0("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/RR/", nm, ".non_clust.tsv"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# Marginal rate per feature bin and mutation type.
# NOTE(review): the rate is divided by 6 while sample_size above uses a factor
# of 5 - confirm which sample count applies.
tot_epi_fin_cont_df %>%
  gather(seq, bin, RepliSeq:RNAseq) %>%
  #  select(-sample_size)%>%
  group_by(sample, ref, bin_type, TP53, mut_type, seq, bin) %>%
  dplyr::summarise(
    tot_size = sum(size),
    tot_count = sum(count)
  ) %>%
  mutate(rate = tot_count / tot_size / 6) %>%
  mutate(id = nm) %>%
  write.table(
    paste0("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/RR/", nm, ".non_clust.merge.tsv"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

# An unmatched closing brace used to follow here; it has no opening
# counterpart anywhere above and stopped the script from parsing.






#####KO######
############
# The section header above used to start with a stray "+", which R read as a
# unary plus applied to the assignment below, so the assignment failed.
#
# Count distinct TC> variants (C>G or C>T) per RepliSeq x RNAseq x H3K36me3
# combination for A3A / TP53 samples whose id contains "_3" or "_100".
# NOTE(review): merge_df and ref_df as built earlier in this script carry
# H3K9me3/H3K27me3 but no H3K36me3 column; this section appears to expect a
# different reference/annotation set - confirm before running.
tot_epi_fin_df <- left_join(
  ref_df,
  merge_df %>%
    filter(grepl("_3", id) | grepl("_100", id)) %>%
    filter(grepl("TP53", id)) %>%
    filter(grepl("C>G", sig_cont) | grepl("C>T", sig_cont)) %>%
    filter(grepl("TC>", sig_cont)) %>%
    filter(grepl("A3A", id)) %>%
    #filter(cluster=="non-clust")%>%
    dplyr::select(`#CHROM`, POS, cluster, RepliSeq, RNAseq, H3K36me3, sig_cont) %>%
    unique() %>%
    group_by(RepliSeq, RNAseq, H3K36me3) %>%
    dplyr::summarise(n = n()) %>%
    ungroup()
)
tot_epi_fin_df

# Combinations without any variant come back as NA from the join; set to 0.
tot_epi_fin_df[is.na(tot_epi_fin_df)] <- 0

tot_epi_fin_df <- tot_epi_fin_df %>% mutate(n = ifelse(is.na(n), 0, n))

tot_epi_fin_df <- tot_epi_fin_df %>% plyr::rename(c("n" = "count"))

# sample_size is 5 * size; the model offset divides by 5 again.
tot_epi_fin_df <- tot_epi_fin_df %>%
  mutate(size = as.double(size)) %>%
  mutate(sample_size = 5 * size)
tot_epi_fin_df$RepliSeq <- as.character(tot_epi_fin_df$RepliSeq)
tot_epi_fin_df$RNAseq <- as.character(tot_epi_fin_df$RNAseq)
tot_epi_fin_df$H3K36me3 <- as.character(tot_epi_fin_df$H3K36me3)

# Inspection only: marginal rate per RNAseq bin. select() is
# namespace-qualified because MASS, which also exports select(), is attached
# after dplyr in this script.
tot_epi_fin_df %>%
  dplyr::select(-RepliSeq, -H3K36me3) %>%
  group_by(RNAseq) %>%
  dplyr::summarise(
    tot_size = sum(size),
    tot_count = sum(count)
  ) %>%
  mutate(rate = tot_count / tot_size)




library(MASS)
# Prefer dplyr's select() over other attached packages' select(). The previous
# call passed a single backticked name, which is not a valid way to state the
# preference; this is the form used earlier in the script.
conflicted::conflicts_prefer(dplyr::select)

# Negative-binomial regression of the variant count on the RepliSeq, H3K36me3
# and RNAseq bins; the offset reduces to log(size).
tot_fit <- glm.nb(
  count ~ RepliSeq + H3K36me3 + RNAseq + offset(log(sample_size / 5)),
  data = tot_epi_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4", "RNAseq5"
)

# Coefficient table (intercept dropped) plus rows with coefficient 0 for the
# reference level of each feature.
t_tot_df <- tibble(bin_name = names(tot_fit$coefficients), coef = tot_fit$coefficients) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef))) %>%
  rbind(data.frame(
    bin_name = c("RepliSeq0", "H3K36me30", "RNAseq0"),
    coef = c(0, 0, 0),
    log2_coef = c(0, 0, 0)
  ))

# x-axis order shared by the plots of this section.
new_bin_order <- c(
  "RepliSeq0", "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me30", "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq0", "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4", "RNAseq5"
)

# Quick look at the log2 coefficients (printed, not saved).
t_tot_df %>%
  ggplot(aes(x = factor(bin_name, levels = new_bin_order), y = log2_coef)) +
  geom_point() +
  theme_classic() +
  ggtitle("clustered Multivariate analysis") +
  xlab("") + ylab("log2([enrichment in non-clust A3A mutation])") +
  theme(
    axis.title = element_text(size = 16),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20)
  ) +
  geom_hline(yintercept = 0, color = "red")#+
#scale_y_continuous(breaks = seq(-1, 1, by = 0.5),
#                   labels=seq(-1,1,by=0.5),
#                   lim=c(-1,1.2))

##tot_T##

# Inspection only: ids of non-TP53 samples containing "_3" that carry TC>T
# variants. select() is namespace-qualified because MASS, which also exports
# select(), is attached after dplyr in this script.
merge_df %>%
  filter(grepl("_3", id)) %>%
  filter(!grepl("TP53", id)) %>%
  filter(grepl("C>G", sig_cont) | grepl("C>T", sig_cont)) %>%
  filter(grepl("TC>T", sig_cont)) %>%
  dplyr::select(id) %>%
  unique()

# Count distinct TC>T variants per RepliSeq x RNAseq x H3K36me3 combination
# for A3A / TP53 samples whose id contains "_3" or "_100".
# NOTE(review): H3K36me3 is not among the columns that merge_df and ref_df
# receive earlier in this script - confirm the expected inputs.
tot_epi_fin_df <- left_join(
  ref_df,
  merge_df %>%
    filter(grepl("_3", id) | grepl("_100", id)) %>%
    filter(grepl("TP53", id)) %>%
    filter(grepl("C>G", sig_cont) | grepl("C>T", sig_cont)) %>%
    filter(grepl("TC>T", sig_cont)) %>%
    filter(grepl("A3A", id)) %>%
    dplyr::select(`#CHROM`, POS, cluster, RepliSeq, RNAseq, H3K36me3, sig_cont) %>%
    unique() %>%
    group_by(RepliSeq, RNAseq, H3K36me3) %>%
    dplyr::summarise(n = n()) %>%
    ungroup()
)
tot_epi_fin_df

# Combinations without any variant come back as NA from the join; set to 0.
tot_epi_fin_df[is.na(tot_epi_fin_df)] <- 0

tot_epi_fin_df <- tot_epi_fin_df %>% mutate(n = ifelse(is.na(n), 0, n))

tot_epi_fin_df <- tot_epi_fin_df %>% plyr::rename(c("n" = "count"))

# sample_size is 5 * size; the model offset divides by 5 again.
tot_epi_fin_df <- tot_epi_fin_df %>%
  mutate(size = as.double(size)) %>%
  mutate(sample_size = 5 * size)
tot_epi_fin_df$RepliSeq <- as.character(tot_epi_fin_df$RepliSeq)
tot_epi_fin_df$RNAseq <- as.character(tot_epi_fin_df$RNAseq)
tot_epi_fin_df$H3K36me3 <- as.character(tot_epi_fin_df$H3K36me3)



library(MASS)
#conflicted::conflict_prefer(`dplyr::select`)

# Negative-binomial fit of the TC>T counts on the three feature bins; the
# offset reduces to log(size).
tot_fit <- glm.nb(
  count ~ RepliSeq + H3K36me3 + RNAseq + offset(log(sample_size / 5)),
  data = tot_epi_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4", "RNAseq5"
)

# Coefficient table (intercept dropped) plus zero-coefficient rows for the
# reference level of each feature.
t_tot_df <- tibble(bin_name = names(tot_fit$coefficients), coef = tot_fit$coefficients) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef))) %>%
  rbind(data.frame(
    bin_name = c("RepliSeq0", "H3K36me30", "RNAseq0"),
    coef = c(0, 0, 0),
    log2_coef = c(0, 0, 0)
  ))

# Quick look at the log2 coefficients (printed, not saved).
ggplot(t_tot_df, aes(x = factor(bin_name, levels = new_bin_order), y = log2_coef)) +
  geom_point() +
  theme_classic() +
  ggtitle("clustered Multivariate analysis") +
  labs(x = "", y = "log2([enrichment in non-clust A3A mutation])") +
  theme(
    plot.title = element_text(size = 20),
    axis.title = element_text(size = 16),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 16)
  ) +
  geom_hline(yintercept = 0, color = "red") +
  scale_y_continuous(breaks = seq(-4, 8, by = 1))


# Count distinct TC>G variants per RepliSeq x RNAseq x H3K36me3 combination
# for A3A / TP53 samples whose id contains "_3" or "_100".
# NOTE(review): H3K36me3 is not among the columns that merge_df and ref_df
# receive earlier in this script - confirm the expected inputs.
tot_G_epi_fin_df <- left_join(
  ref_df,
  merge_df %>%
    filter(
      grepl("_3", id) | grepl("_100", id),
      grepl("TP53", id),
      grepl("C>G", sig_cont) | grepl("C>T", sig_cont),
      grepl("TC>G", sig_cont),
      grepl("A3A", id)
    ) %>%
    dplyr::select(`#CHROM`, POS, cluster, RepliSeq, RNAseq, H3K36me3, sig_cont) %>%
    unique() %>%
    group_by(RepliSeq, RNAseq, H3K36me3) %>%
    dplyr::summarise(n = n()) %>%
    ungroup()
)
#tot_epi_fin_df
# Combinations without any variant come back as NA from the join; set to 0.
tot_G_epi_fin_df[is.na(tot_G_epi_fin_df)] <- 0

# Rename n -> count, make size numeric and derive sample_size = 5 * size
# (the model offset divides by 5 again).
tot_G_epi_fin_df <- tot_G_epi_fin_df %>%
  mutate(n = ifelse(is.na(n), 0, n)) %>%
  plyr::rename(c("n" = "count")) %>%
  mutate(size = as.double(size), sample_size = 5 * size)
tot_G_epi_fin_df$RepliSeq <- as.character(tot_G_epi_fin_df$RepliSeq)
tot_G_epi_fin_df$RNAseq <- as.character(tot_G_epi_fin_df$RNAseq)
tot_G_epi_fin_df$H3K36me3 <- as.character(tot_G_epi_fin_df$H3K36me3)



library(MASS)
# Prefer dplyr's select() over other attached packages' select(). The previous
# call passed a single backticked name, which is not a valid way to state the
# preference; this is the form used earlier in the script.
conflicted::conflicts_prefer(dplyr::select)

# Negative-binomial fit of the TC>G counts on the three feature bins; the
# offset reduces to log(size).
tot_G_fit <- glm.nb(
  count ~ RepliSeq + H3K36me3 + RNAseq + offset(log(sample_size / 5)),
  data = tot_G_epi_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4", "RNAseq5"
)

# Coefficient table (intercept dropped) plus zero-coefficient rows for the
# reference level of each feature.
t_tot_G_df <- tibble(bin_name = names(tot_G_fit$coefficients), coef = tot_G_fit$coefficients) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef))) %>%
  rbind(data.frame(
    bin_name = c("RepliSeq0", "H3K36me30", "RNAseq0"),
    coef = c(0, 0, 0),
    log2_coef = c(0, 0, 0)
  ))

# Quick look at the log2 coefficients (printed, not saved).
t_tot_G_df %>%
  ggplot(aes(x = factor(bin_name, levels = new_bin_order), y = log2_coef)) +
  geom_point() +
  theme_classic() +
  ggtitle("clustered Multivariate analysis") +
  xlab("") + ylab("log2([enrichment in non-clust A3A mutation])") +
  theme(
    axis.title = element_text(size = 16),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20)
  ) +
  geom_hline(yintercept = 0, color = "red") +
  scale_y_continuous(breaks = seq(-4, 8, by = 1))





# Stack the TC>T and TC>G coefficient tables, tagging each with its group.
tot_event_df <- rbind(
  #omi_event_t_df%>%mutate(group="omikli"),
  #kat_event_t_df%>%mutate(group="kataegis"),
  #t_df%>%mutate(group="non-clust"),
  t_tot_df %>% mutate(group = "CtoT"),
  t_tot_G_df %>% mutate(group = "CtoG")
)
tot_event_df$group %>% unique()

# The feature each row belongs to is its bin_name without the final bin digit.
# Deriving it from the name replaces labels assigned by row position
# (4 H3K36me3, 4 RepliSeq, 6 RNAseq), which are only right if arrange() puts
# "RepliSeq..." before "RNAseq..."; in C-locale ordering it is the other way
# round.
tot_event_df <- tot_event_df %>%
  arrange(group, bin_name) %>%
  mutate(bin_group = sub("[0-9]$", "", bin_name))
#tot_event_df<-tot_event_df%>%mutate(bin_group=rep(rep(c("RepliSeq","H3K36me3","RNAseq"),each=3),5))
tot_event_df$bin_group <- factor(tot_event_df$bin_group, levels = c("RepliSeq", "H3K36me3", "RNAseq"))
tot_event_df$group <- factor(tot_event_df$group, levels = c("CtoT", "CtoG"))

# Reference-level rows, one per (group, feature). coef is 0 so that it agrees
# with log2_coef = 0 (a coefficient of 0 is a fold of 1).
zero_event_df <- data.frame(
  bin_name = rep(c("RepliSeq0", "H3K36me30", "RNAseq0"), 2),
  coef = c(0),
  log2_coef = c(0),
  group = rep(c("CtoT", "CtoG"), each = 3),
  bin_group = rep(c("RepliSeq", "H3K36me3", "RNAseq"), 2)
) %>% as_tibble()
zero_event_df

# t_tot_df and t_tot_G_df already contain the reference-level rows, so
# zero_event_df is not appended again; doing so would duplicate them.
bin6_ko_tot_event_df <- tot_event_df

# Line-and-point plot of log2 coefficients per bin, one colour per group,
# faceted by bin variable; x-axis text and legend are suppressed.
p1_bin6_ko <- bin6_ko_tot_event_df %>%
  #filter(group%in%c("non-clust","kataegis","omikli"))%>%
  ggplot(
    aes(
      x = factor(bin_name, levels = new_bin_order),
      y = log2_coef,
      col = group,
      group = group
    )
  ) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  xlab("") +
  ylab("log2([enrichment in A3A mutations])") +
  theme(
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 40),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 30),
    legend.position = "none"
  ) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  scale_y_continuous(
    breaks = seq(-1, 1, by = 0.5),
    lim = c(-1, 1)
  ) +
  facet_wrap(~bin_group, scales = "free_x") +
  ggtitle("KO_own_bin6")
#############
# NOTE(review): `p1_bin6` is not created in this section (only `p1_bin6_ko`
# is); presumably it comes from an earlier part of the file -- confirm.
p1_bin6
tot_event_df$group

# C>T vs C>G comparison plot. At this point `tot_event_df` only contains the
# groups "CtoT" and "CtoG", so those are the labels to filter on; the former
# "non-clust_CtoT"/"non-clust_CtoG" filter matched no rows.
p2 <- tot_event_df %>%
  filter(group %in% c("CtoT", "CtoG")) %>%
  ggplot(
    aes(
      x = factor(bin_name, levels = new_bin_order),
      y = log2_coef,
      col = group,
      group = group
    )
  ) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  xlab("") +
  ylab("log2([enrichment in A3A mutations])") +
  theme(
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 40),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 30),
    legend.position = "none"
  ) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  scale_y_continuous(
    breaks = seq(-1, 1, by = 0.5),
    labels = seq(-1, 1, by = 0.5),
    lim = c(-1, 1)
  ) +
  facet_wrap(~bin_group, scales = "free_x") +
  scale_colour_manual(values = c("Orange", "Purple"))


#p1
p2



##non_clust##
merge_df

# Non-clustered TC> mutations that have a value for all three annotations.
# The H3K36me3 test must reference the column; the previous version compared
# the string literal "H3K36me3" with ".", which is always TRUE, so rows with
# a missing H3K36me3 annotation were never removed.
noncl_epi_df <- merge_df %>%
  filter(cluster == "non-clust") %>%
  filter(RepliSeq != "." & RNAseq != "." & H3K36me3 != ".") %>%
  filter(grepl("TC>", sig_cont))
#noncl_epi_df$RepliSeq<-as.double(noncl_epi_df$RepliSeq)
#noncl_epi_df$H3K36me3<-as.double(noncl_epi_df$H3K36me3)

# Interactive inspection: distinct positions from ids containing "_3" and
# not containing "TP53".
noncl_epi_df %>%
  filter(grepl("_3", id), !grepl("TP53", id)) %>%
  dplyr::select(`#CHROM`, POS, cluster, RepliSeq, RNAseq, H3K36me3, sig_cont) %>%
  unique()


# `A3A_3_df` is defined elsewhere in the file.
A3A_3_df %>% filter(clustered == "non-clustered")


# Same id subset, de-duplicated on the selected columns first, then
# restricted to contexts matching C>G or C>T.
noncl_epi_df %>%
  dplyr::select(id, cluster, RepliSeq, RNAseq, H3K36me3, sig_cont) %>%
  unique() %>%
  filter(
    grepl("_3", id),
    !grepl("TP53", id),
    grepl("C>G", sig_cont) | grepl("C>T", sig_cont)
  )



# Reference table: one row per (RepliSeq, RNAseq, H3K36me3) combination with
# its `size`; the file has no header, so column names are supplied here.
ref_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/06_ref/final/combination/RR3.q4.q4.q4.size.tsv",
  col_names = c("RepliSeq", "RNAseq", "H3K36me3", "size"),
  col_types = cols(
    .default = "c",
    RepliSeq = "c",
    RNAseq = "c",
    H3K36me3 = "c",
    size = "d"
  )
)

# Same table tagged with the cluster label so it can be joined on `cluster`.
ref_noncl_df <- mutate(ref_df, cluster = "non-clust")
#)%>%as.tibble()


# Count TC>G / TC>T non-clustered mutations per bin combination and attach
# the counts to the reference table, so combinations without any mutation
# are kept (their count is NA after the join and is set to 0 below).
noncl_epi_fin_df <- left_join(
  ref_noncl_df,
  noncl_epi_df %>%
    filter(
      grepl("_3", id),
      !grepl("TP53", id),
      grepl("TC>G", sig_cont) | grepl("TC>T", sig_cont)
    ) %>%
    dplyr::select(`#CHROM`, POS, cluster, RepliSeq, RNAseq, H3K36me3, sig_cont) %>%
    group_by(cluster, RepliSeq, RNAseq, H3K36me3) %>%
    dplyr::summarise(n = n()) %>%
    ungroup()
)

noncl_epi_fin_df <- noncl_epi_fin_df %>%
  mutate(n = ifelse(is.na(n), 0, n)) %>%
  plyr::rename(c("n" = "count")) %>%
  mutate(
    size = as.double(size),
    sample_size = 5 * size
  ) %>%
  mutate(across(c(RepliSeq, RNAseq, H3K36me3), as.character))

library(MASS)
#conflicted::conflict_prefer("select", "dplyr")
conflicted::conflicts_prefer(dplyr::filter)
#conflicted::conflicts_prefer(MASS::glm.nb)

# Negative-binomial fit; the offset log(sample_size / 5) equals log(size).
noncl_fit <- glm.nb(
  count ~ RepliSeq + H3K36me3 + RNAseq + offset(log(sample_size / 5)),
  data = noncl_epi_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq1", "RNAseq2", "RNAseq3"
)

# Non-intercept coefficients, also expressed on the log2 scale.
t_df <- tibble(
  bin_name = names(noncl_fit$coefficients),
  coef = noncl_fit$coefficients
) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef)))
# Dot plot of the log2 coefficients in `bin_order`, red line at 0.
t_df %>%
  ggplot(aes(x = factor(bin_name, levels = bin_order), y = log2_coef)) +
  geom_point() +
  theme_classic() +
  ggtitle("clustered Multivariate analysis") +
  xlab("") +
  ylab("log2([enrichment in non-clust A3A mutation])") +
  theme(
    axis.title = element_text(size = 16),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20)
  ) +
  geom_hline(yintercept = 0, color = "red") +
  scale_y_continuous(breaks = seq(-4, 8, by = 1))



##noncl CtoT

# Per-bin counts of non-clustered TC>T mutations joined onto the reference
# table (bin combinations without mutations get count 0 below).
noncl_epi_T_fin_df <- left_join(
  ref_noncl_df,
  noncl_epi_df %>%
    filter(
      grepl("_3", id),
      !grepl("TP53", id),
      grepl("C>G", sig_cont) | grepl("C>T", sig_cont)
    ) %>%
    dplyr::select(`#CHROM`, POS, REF, cluster, RepliSeq, RNAseq, H3K36me3, sig_cont) %>%
    filter(grepl("TC>T", sig_cont)) %>%
    group_by(cluster, RepliSeq, RNAseq, H3K36me3) %>%
    dplyr::summarise(n = n()) %>%
    ungroup()
)

noncl_epi_T_fin_df <- noncl_epi_T_fin_df %>%
  mutate(n = ifelse(is.na(n), 0, n)) %>%
  plyr::rename(c("n" = "count")) %>%
  mutate(
    size = as.double(size),
    sample_size = 5 * size
  ) %>%
  mutate(across(c(RepliSeq, RNAseq, H3K36me3), as.character))

library(MASS)
#conflicted::conflict_prefer("select", "dplyr")
conflicted::conflicts_prefer(dplyr::filter)
#conflicted::conflicts_prefer(MASS::glm.nb)

# Negative-binomial fit; the offset log(sample_size / 5) equals log(size).
noncl_T_fit <- glm.nb(
  count ~ RepliSeq + H3K36me3 + RNAseq + offset(log(sample_size / 5)),
  data = noncl_epi_T_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq1", "RNAseq2", "RNAseq3"
)

# Non-intercept coefficients, also expressed on the log2 scale.
t_T_df <- tibble(
  bin_name = names(noncl_T_fit$coefficients),
  coef = noncl_T_fit$coefficients
) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef)))
# Dot plot of the TC>T log2 coefficients in `bin_order`, red line at 0.
t_T_df %>%
  ggplot(aes(x = factor(bin_name, levels = bin_order), y = log2_coef)) +
  geom_point() +
  theme_classic() +
  ggtitle("clustered Multivariate analysis") +
  xlab("") +
  ylab("log2([enrichment in non-clust A3A mutation])") +
  theme(
    axis.title = element_text(size = 16),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20)
  ) +
  geom_hline(yintercept = 0, color = "red") +
  scale_y_continuous(breaks = seq(-4, 8, by = 1))



#non-cl_ CtoG

# Per-bin counts of non-clustered TC>G mutations joined onto the reference
# table (bin combinations without mutations get count 0 below).
noncl_epi_G_fin_df <- left_join(
  ref_noncl_df,
  noncl_epi_df %>%
    filter(
      grepl("_3", id),
      !grepl("TP53", id),
      grepl("C>G", sig_cont) | grepl("C>T", sig_cont)
    ) %>%
    dplyr::select(`#CHROM`, POS, REF, cluster, RepliSeq, RNAseq, H3K36me3, sig_cont) %>%
    filter(grepl("TC>G", sig_cont)) %>%
    group_by(cluster, RepliSeq, RNAseq, H3K36me3) %>%
    dplyr::summarise(n = n()) %>%
    ungroup()
)

noncl_epi_G_fin_df <- noncl_epi_G_fin_df %>%
  mutate(n = ifelse(is.na(n), 0, n)) %>%
  plyr::rename(c("n" = "count")) %>%
  mutate(
    size = as.double(size),
    sample_size = 5 * size
  ) %>%
  mutate(across(c(RepliSeq, RNAseq, H3K36me3), as.character))
#conflicted::conflicts_prefer(MASS::glm.nb)

# Negative-binomial fit; the offset log(sample_size / 5) equals log(size).
noncl_G_fit <- glm.nb(
  count ~ RepliSeq + H3K36me3 + RNAseq + offset(log(sample_size / 5)),
  data = noncl_epi_G_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq1", "RNAseq2", "RNAseq3"
)

# Non-intercept coefficients, also expressed on the log2 scale.
t_G_df <- tibble(
  bin_name = names(noncl_G_fit$coefficients),
  coef = noncl_G_fit$coefficients
) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef)))
# Dot plot of the TC>G log2 coefficients in `bin_order`, red line at 0.
t_G_df %>%
  ggplot(aes(x = factor(bin_name, levels = bin_order), y = log2_coef)) +
  geom_point() +
  theme_classic() +
  ggtitle("clustered Multivariate analysis") +
  xlab("") +
  ylab("log2([enrichment in non-clust A3A mutation])") +
  theme(
    axis.title = element_text(size = 16),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20)
  ) +
  geom_hline(yintercept = 0, color = "red") +
  scale_y_continuous(breaks = seq(-4, 8, by = 1))


##kataegis event##

# Three lists of clusters to exclude. Each is matched on its `info` column
# further down; only the mis-phased list has `info` built here, as
# "<id>_<#CHROM>_<cluster_id>".
excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/exclude_cl_mut_df.txt"
)

overlap_excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/overlapped_cluster.txt"
)
##overlap_ex
miss_phs_excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/miss_phased_cluster.txt"
) %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))



# Event-level kataegis table: each (id, #CHROM, cluster_id) cluster is reduced
# to one row carrying the rounded median bin of its member mutations, then
# events are counted per bin combination.
kat_event_epi_df <- merge_df %>%
  filter(cluster == "kataegis") %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_")) %>%
  # keep clusters present in the curated list, drop the excluded ones
  filter(
    info %in% A3A_cl_merge_f_excl_df$info,
    #  grepl("_3",id),
    #  !grepl("TP53",id),
    !info %in% excl_df$info,
    !info %in% overlap_excl_df$info,
    !info %in% miss_phs_excl_df$info
  ) %>%
  # drop mutations lacking any of the three annotations
  filter(RepliSeq != ".", RNAseq != ".", H3K36me3 != ".") %>%
  mutate(
    RepliSeq = as.double(RepliSeq),
    H3K36me3 = as.double(H3K36me3),
    RNAseq = as.double(RNAseq)
  ) %>%
  group_by(id, `#CHROM`, cluster_id) %>%
  dplyr::summarise(
    median_RepliSeq = round(median(RepliSeq)),
    median_RNAseq = round(median(RNAseq)),
    median_H3K36me3 = round(median(H3K36me3))
  ) %>%
  plyr::rename(
    c(
      "median_RepliSeq" = "RepliSeq",
      "median_RNAseq" = "RNAseq",
      "median_H3K36me3" = "H3K36me3"
    )
  ) %>%
  group_by(RepliSeq, RNAseq, H3K36me3) %>%
  dplyr::summarise(n = n())

# Back to character so the keys match the reference table's column types.
kat_event_epi_df$RepliSeq <- as.character(kat_event_epi_df$RepliSeq)
kat_event_epi_df$RNAseq <- as.character(kat_event_epi_df$RNAseq)
kat_event_epi_df$H3K36me3 <- as.character(kat_event_epi_df$H3K36me3)
# Reference table of bin combinations and their sizes (headerless file).
ref_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/06_ref/final/combination/RR3.q4.q4.q4.size.tsv",
  col_names = c("RepliSeq", "RNAseq", "H3K36me3", "size"),
  col_types = cols(
    .default = "c",
    RepliSeq = "c",
    RNAseq = "c",
    H3K36me3 = "c",
    size = "d"
  )
)

# Attach event counts to every bin combination; unmatched ones become 0.
kat_event_epi_fin_df <- left_join(ref_df, kat_event_epi_df)
kat_event_epi_fin_df[is.na(kat_event_epi_fin_df)] <- 0
kat_event_epi_fin_df <- kat_event_epi_fin_df %>%
  plyr::rename(c("n" = "count")) %>%
  mutate(
    size = as.double(size),
    sample_size = 5 * size
  )


# Negative-binomial fit; the offset log(sample_size / 5) equals log(size).
kat_event_fit <- glm.nb(
  count ~ RepliSeq + H3K36me3 + RNAseq + offset(log(sample_size / 5)),
  data = kat_event_epi_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq1", "RNAseq2", "RNAseq3"
)


# Non-intercept coefficients, also expressed on the log2 scale.
kat_event_t_df <- tibble(
  bin_name = names(kat_event_fit$coefficients),
  coef = kat_event_fit$coefficients
) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef)))
# Dot plot of the kataegis-event log2 coefficients, red line at 0.
kat_event_t_df %>%
  ggplot(aes(x = factor(bin_name, levels = bin_order), y = log2_coef)) +
  geom_point() +
  theme_classic() +
  ggtitle("clustered Multivariate analysis") +
  xlab("") +
  ylab("log2([enrichment in non-clust A3A mutation])") +
  theme(
    axis.title = element_text(size = 16),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20)
  ) +
  geom_hline(yintercept = 0, color = "red") +
  scale_y_continuous(breaks = seq(-4, 8, by = 1))

##kataegis each##


# Mutation-level kataegis table: every retained kataegis mutation counts
# once towards its own (RepliSeq, RNAseq, H3K36me3) bin combination.
kat_epi_df <- merge_df %>%
  filter(cluster == "kataegis") %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_")) %>%
  # curated clusters only, from ids containing "_3" and not "TP53",
  # minus the three exclusion lists
  filter(
    info %in% A3A_cl_merge_f_excl_df$info,
    grepl("_3", id),
    !grepl("TP53", id),
    !info %in% excl_df$info,
    !info %in% overlap_excl_df$info,
    !info %in% miss_phs_excl_df$info
  ) %>%
  # drop mutations lacking any of the three annotations
  filter(RepliSeq != ".", RNAseq != ".", H3K36me3 != ".") %>%
  group_by(RepliSeq, RNAseq, H3K36me3) %>%
  dplyr::summarise(n = n())
kat_epi_df$RepliSeq <- as.character(kat_epi_df$RepliSeq)
kat_epi_df$RNAseq <- as.character(kat_epi_df$RNAseq)

# Reference table of bin combinations and their sizes (headerless file).
ref_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/06_ref/final/combination/RR3.q4.q4.q4.size.tsv",
  col_names = c("RepliSeq", "RNAseq", "H3K36me3", "size"),
  col_types = cols(
    .default = "c",
    RepliSeq = "c",
    RNAseq = "c",
    H3K36me3 = "c",
    size = "d"
  )
)

# Attach mutation counts to every bin combination; unmatched ones become 0.
kat_epi_fin_df <- left_join(ref_df, kat_epi_df)
kat_epi_fin_df[is.na(kat_epi_fin_df)] <- 0
kat_epi_fin_df <- kat_epi_fin_df %>%
  plyr::rename(c("n" = "count")) %>%
  mutate(
    size = as.double(size),
    sample_size = 5 * size
  )


# Negative-binomial fit; the offset log(sample_size / 5) equals log(size).
kat_fit <- glm.nb(
  count ~ RepliSeq + H3K36me3 + RNAseq + offset(log(sample_size / 5)),
  data = kat_epi_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq1", "RNAseq2", "RNAseq3"
)


# Non-intercept coefficients, also expressed on the log2 scale.
kat_t_df <- tibble(
  bin_name = names(kat_fit$coefficients),
  coef = kat_fit$coefficients
) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef)))
# Dot plot of the per-mutation kataegis log2 coefficients, red line at 0.
kat_t_df %>%
  ggplot(aes(x = factor(bin_name, levels = bin_order), y = log2_coef)) +
  geom_point() +
  theme_classic() +
  ggtitle("clustered Multivariate analysis") +
  xlab("") +
  ylab("log2([enrichment in non-clust A3A mutation])") +
  theme(
    axis.title = element_text(size = 16),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20)
  ) +
  geom_hline(yintercept = 0, color = "red") +
  scale_y_continuous(breaks = seq(-4, 8, by = 1))


##omikli each###

# Re-read the three cluster exclusion lists used by the omikli filters
# below; `info` for the mis-phased list is "<id>_<#CHROM>_<cluster_id>".
excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/exclude_cl_mut_df.txt"
)

overlap_excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/overlapped_cluster.txt"
)
##overlap_ex
miss_phs_excl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/miss_phased_cluster.txt"
) %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))



# Mutation-level omikli table: every retained omikli mutation counts once
# towards its own (RepliSeq, RNAseq, H3K36me3) bin combination.
omi_epi_df <- merge_df %>%
  filter(cluster == "omikli") %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_")) %>%
  # curated clusters only, from ids containing "_3" and not "TP53",
  # minus the three exclusion lists
  filter(
    info %in% A3A_cl_merge_f_excl_df$info,
    grepl("_3", id),
    !grepl("TP53", id),
    !info %in% excl_df$info,
    !info %in% overlap_excl_df$info,
    !info %in% miss_phs_excl_df$info
  ) %>%
  # drop mutations lacking any of the three annotations
  filter(RepliSeq != ".", RNAseq != ".", H3K36me3 != ".") %>%
  group_by(RepliSeq, RNAseq, H3K36me3) %>%
  dplyr::summarise(n = n())
omi_epi_df$RepliSeq <- as.character(omi_epi_df$RepliSeq)
omi_epi_df$RNAseq <- as.character(omi_epi_df$RNAseq)

# Reference table of bin combinations and their sizes (headerless file).
ref_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/06_ref/final/combination/RR3.q4.q4.q4.size.tsv",
  col_names = c("RepliSeq", "RNAseq", "H3K36me3", "size"),
  col_types = cols(
    .default = "c",
    RepliSeq = "c",
    RNAseq = "c",
    H3K36me3 = "c",
    size = "d"
  )
)

# Attach mutation counts to every bin combination; unmatched ones become 0.
omi_epi_fin_df <- left_join(ref_df, omi_epi_df)
omi_epi_fin_df[is.na(omi_epi_fin_df)] <- 0
omi_epi_fin_df <- omi_epi_fin_df %>%
  plyr::rename(c("n" = "count")) %>%
  mutate(
    size = as.double(size),
    sample_size = 5 * size
  )


# Negative-binomial fit; the offset log(sample_size / 5) equals log(size).
omi_fit <- glm.nb(
  count ~ RepliSeq + H3K36me3 + RNAseq + offset(log(sample_size / 5)),
  data = omi_epi_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq1", "RNAseq2", "RNAseq3"
)


# Non-intercept coefficients, also expressed on the log2 scale.
omi_t_df <- tibble(
  bin_name = names(omi_fit$coefficients),
  coef = omi_fit$coefficients
) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef)))
# Dot plot of the per-mutation omikli log2 coefficients, red line at 0.
omi_t_df %>%
  ggplot(aes(x = factor(bin_name, levels = bin_order), y = log2_coef)) +
  geom_point() +
  theme_classic() +
  ggtitle("clustered Multivariate analysis") +
  xlab("") +
  ylab("log2([enrichment in non-clust A3A mutation])") +
  theme(
    axis.title = element_text(size = 16),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20)
  ) +
  geom_hline(yintercept = 0, color = "red") +
  scale_y_continuous(breaks = seq(-4, 8, by = 1))


##omikli event##

# Event-level omikli table: each (id, #CHROM, cluster_id) cluster is reduced
# to one row carrying the rounded median bin of its member mutations, then
# events are counted per bin combination.
omi_event_epi_df <- merge_df %>%
  filter(cluster == "omikli") %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_")) %>%
  filter(info %in% A3A_cl_merge_f_excl_df$info) %>%
  filter(grepl("_3", id)) %>%
  filter(!grepl("TP53", id)) %>%
  filter(!info %in% excl_df$info) %>%
  filter(!info %in% overlap_excl_df$info) %>%
  filter(!info %in% miss_phs_excl_df$info) %>%
  filter(RepliSeq != ".") %>%
  filter(RNAseq != ".") %>%
  filter(H3K36me3 != ".") %>%
  mutate(
    RepliSeq = as.double(RepliSeq),
    H3K36me3 = as.double(H3K36me3),
    RNAseq = as.double(RNAseq)
  ) %>%
  group_by(id, `#CHROM`, cluster_id) %>%
  dplyr::summarise(
    median_RepliSeq = round(median(RepliSeq)),
    median_RNAseq = round(median(RNAseq)),
    median_H3K36me3 = round(median(H3K36me3))
  ) %>%
  plyr::rename(
    c(
      "median_RepliSeq" = "RepliSeq",
      "median_RNAseq" = "RNAseq",
      "median_H3K36me3" = "H3K36me3"
    )
  ) %>%
  group_by(RepliSeq, RNAseq, H3K36me3) %>%
  dplyr::summarise(n = n())

# Back to character so the keys match the reference table's column types.
omi_event_epi_df$RepliSeq <- as.character(omi_event_epi_df$RepliSeq)
omi_event_epi_df$RNAseq <- as.character(omi_event_epi_df$RNAseq)
omi_event_epi_df$H3K36me3 <- as.character(omi_event_epi_df$H3K36me3)

# Sanity check: total number of omikli events counted. This used to run
# before the table was created, which fails (or reports a stale object) when
# the script is executed top to bottom.
omi_event_epi_df$n %>% sum()
# Reference table of bin combinations and their sizes (headerless file).
ref_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/06_ref/final/combination/RR3.q4.q4.q4.size.tsv",
  col_names = c("RepliSeq", "RNAseq", "H3K36me3", "size"),
  col_types = cols(
    .default = "c",
    RepliSeq = "c",
    RNAseq = "c",
    H3K36me3 = "c",
    size = "d"
  )
)

# Attach event counts to every bin combination; unmatched ones become 0.
omi_event_epi_fin_df <- left_join(ref_df, omi_event_epi_df)
omi_event_epi_fin_df[is.na(omi_event_epi_fin_df)] <- 0
omi_event_epi_fin_df <- omi_event_epi_fin_df %>%
  plyr::rename(c("n" = "count")) %>%
  mutate(
    size = as.double(size),
    sample_size = 5 * size
  )


# Negative-binomial fit; the offset log(sample_size / 5) equals log(size).
omi_event_fit <- glm.nb(
  count ~ RepliSeq + H3K36me3 + RNAseq + offset(log(sample_size / 5)),
  data = omi_event_epi_fin_df
)
bin_order <- c(
  "RepliSeq1", "RepliSeq2", "RepliSeq3",
  "H3K36me31", "H3K36me32", "H3K36me33",
  "RNAseq1", "RNAseq2", "RNAseq3"
)


# Non-intercept coefficients, also expressed on the log2 scale.
omi_event_t_df <- tibble(
  bin_name = names(omi_event_fit$coefficients),
  coef = omi_event_fit$coefficients
) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef)))
# Dot plot of the omikli-event log2 coefficients, red line at 0.
omi_event_t_df %>%
  ggplot(aes(x = factor(bin_name, levels = bin_order), y = log2_coef)) +
  geom_point() +
  theme_classic() +
  ggtitle("clustered Multivariate analysis") +
  xlab("") +
  ylab("log2([enrichment in non-clust A3A mutation])") +
  theme(
    axis.title = element_text(size = 16),
    axis.text.y = element_text(size = 16),
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20)
  ) +
  geom_hline(yintercept = 0, color = "red") +
  scale_y_continuous(breaks = seq(-4, 8, by = 1))


# Stack the coefficient tables of all five analyses, tagged by group.
tot_event_df <- rbind(
  omi_event_t_df %>% mutate(group = "omikli"),
  kat_event_t_df %>% mutate(group = "kataegis"),
  t_df %>% mutate(group = "non-clust"),
  t_T_df %>% mutate(group = "non-clust_CtoT"),
  t_G_df %>% mutate(group = "non-clust_CtoG")
)
tot_event_df$group %>% unique()

# Facet label = bin name without its trailing bin digit. Deriving it from
# the name (rather than a positional rep() that assumes exactly 9
# coefficients per model in a fixed order) keeps the label correct even if a
# model returns a different number of terms.
tot_event_df <- tot_event_df %>%
  mutate(bin_group = gsub("[0-9]$", "", bin_name))
tot_event_df$bin_group <- factor(
  tot_event_df$bin_group,
  levels = c("RepliSeq", "H3K36me3", "RNAseq")
)
tot_event_df$group <- factor(
  tot_event_df$group,
  levels = c("non-clust", "omikli", "kataegis", "non-clust_CtoT", "non-clust_CtoG")
)


# Reference (bin 0) rows for every group. The coefficient of a reference
# level is 0 on the log scale, matching log2_coef = 0.
zero_event_df <- data.frame(
  bin_name = rep(c("RepliSeq0", "H3K36me30", "RNAseq0"), 5),
  coef = 0,
  log2_coef = 0,
  group = rep(
    c("omikli", "kataegis", "non-clust", "non-clust_CtoT", "non-clust_CtoG"),
    each = 3
  ),
  bin_group = rep(c("RepliSeq", "H3K36me3", "RNAseq"), 5)
) %>%
  as_tibble()
zero_event_df
tot_event_df <- rbind(tot_event_df, zero_event_df)
# Clustered vs non-clustered comparison: log2 coefficients per bin for the
# "non-clust", "kataegis" and "omikli" groups, faceted by bin variable.
p1 <- tot_event_df %>%
  filter(group %in% c("non-clust", "kataegis", "omikli")) %>%
  ggplot(
    aes(
      x = factor(bin_name, levels = new_bin_order),
      y = log2_coef,
      col = group,
      group = group
    )
  ) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  xlab("") +
  ylab("log2([enrichment in A3A mutations])") +
  theme(
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 40),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 30),
    legend.position = "none"
  ) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  scale_y_continuous(
    breaks = seq(-2, 8, by = 1),
    lim = c(-2.1, 4)
  ) +
  facet_wrap(~bin_group, scales = "free_x")

p1
p1
tot_event_df$group

# Non-clustered C>T vs C>G comparison, same layout as the plot above but
# with a fixed [-1, 1] y-range and manual orange/purple colours.
p2 <- tot_event_df %>%
  filter(group %in% c("non-clust_CtoT", "non-clust_CtoG")) %>%
  ggplot(
    aes(
      x = factor(bin_name, levels = new_bin_order),
      y = log2_coef,
      col = group,
      group = group
    )
  ) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  xlab("") +
  ylab("log2([enrichment in A3A mutations])") +
  theme(
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 40),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 30),
    legend.position = "none"
  ) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  scale_y_continuous(
    breaks = seq(-1, 1, by = 0.5),
    labels = seq(-1, 1, by = 0.5),
    lim = c(-1, 1)
  ) +
  facet_wrap(~bin_group, scales = "free_x") +
  scale_colour_manual(values = c("Orange", "Purple"))


#p1
p2
#ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/enrichment_cluster.v2.pdf",p1,
#       width=10,height=5)

#ggsave("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/enrichment_noncluster.v2.pdf",p2,
#       width=10,height=5)

# Stack three coefficient tables for the "total" plots below.
# NOTE(review): `t_tot_df` appears twice and `t_tot_G_df` sits in the second
# slot, which the rep() below labels "total_T" (the third slot, `t_tot_df`,
# is labelled "total_G"). Looks like the second/third inputs or the labels
# may be swapped or stale -- confirm the intended tables.
t_total_df<-rbind(
  t_tot_df,
  t_tot_G_df,
  t_tot_df
)
# Facet label = bin name with its trailing digit removed.
t_total_df<-t_total_df%>%mutate(bin_group=gsub("[0-9]$","",bin_name))

t_total_df$bin_group<-factor(t_total_df$bin_group,levels=c("RepliSeq","H3K36me3","RNAseq"))
# Group labels are assigned by row position and assume exactly 12 rows per
# stacked table (36 rows in total); mutate() errors for any other row count.
# NOTE(review): `t_tot_G_df` as built in this file has 11 coefficient rows
# plus 3 reference rows -- verify that 12 is still the right block size.
t_total_df<-t_total_df%>%mutate(group=rep(c("total","total_T","total_G"),each=12))
#t_tot_df<-t_tot_df%>%mutate(group="total")
# Plot of the "total" group only, faceted by bin variable, then saved as PDF.
p3 <- t_total_df %>%
  filter(group == "total") %>%
  ggplot(
    aes(
      x = factor(bin_name, levels = new_bin_order),
      y = log2_coef,
      col = group,
      group = group
    )
  ) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  xlab("") +
  ylab("log2([enrichment in A3A mutations])") +
  theme(
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 40),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 30),
    legend.position = "none"
  ) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  scale_y_continuous(
    breaks = seq(-1, 1, by = 0.5),
    labels = seq(-1, 1, by = 0.5),
    lim = c(-1, 1)
  ) +
  facet_wrap(~bin_group, scales = "free_x")
p3
ggsave(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/enrichment_total.v2.pdf",
  p3,
  width = 10,
  height = 5
)


# Plot of every group except "total" (manual red/black colours), saved as
# PDF; `p2` is printed again afterwards.
p4 <- t_total_df %>%
  filter(group != "total") %>%
  ggplot(
    aes(
      x = factor(bin_name, levels = new_bin_order),
      y = log2_coef,
      col = group,
      group = group
    )
  ) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  xlab("") +
  ylab("log2([enrichment in A3A mutations])") +
  theme(
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 40),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 30),
    legend.position = "none"
  ) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  scale_y_continuous(
    breaks = seq(-1, 1, by = 0.5),
    labels = seq(-1, 1, by = 0.5),
    lim = c(-1, 1)
  ) +
  facet_wrap(~bin_group, scales = "free_x") +
  scale_colour_manual(values = c("red", "black"))
p4
ggsave(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/enrichment_total.CG.v2.pdf",
  p4,
  width = 10,
  height = 5
)



p2



# Build a per-coefficient summary table from a fitted model.
#
# fit: a fitted model whose summary() has a `coefficients` matrix with
#      "Estimate" and "Std. Error" columns (here: glm.nb fits).
# Returns a tibble with one row per non-intercept coefficient:
#   bin_name, estimate, std_error,
#   lower_CI / upper_CI  - estimate -/+ 1.96 * std_error (natural-log scale)
#   fold, lower_fold, upper_fold - exp() of the above when estimate > 0,
#                                  otherwise the reciprocal.
# NOTE(review): because of the reciprocal branch, `fold` is always >= 1, so
# the direction of the effect is not recoverable from `fold` alone, and for
# negative estimates lower_fold is larger than upper_fold.
nb_coef_summary <- function(fit) {
  coef_mat <- summary(fit)$coefficients
  bind_cols(
    tibble(bin_name = rownames(coef_mat)),
    as_tibble(coef_mat)
  ) %>%
    dplyr::filter(bin_name != "(Intercept)") %>%
    dplyr::rename(estimate = Estimate, std_error = `Std. Error`) %>%
    dplyr::select(bin_name, estimate, std_error) %>%
    mutate(lower_CI = estimate - 1.96 * std_error) %>%
    mutate(upper_CI = estimate + 1.96 * std_error) %>%
    mutate(fold = ifelse(estimate > 0, exp(estimate), 1 / exp(estimate))) %>%
    mutate(lower_fold = ifelse(estimate > 0, exp(lower_CI), 1 / exp(lower_CI))) %>%
    mutate(upper_fold = ifelse(estimate > 0, exp(upper_CI), 1 / exp(upper_CI)))
}

# Same summary for each of the five fits (kataegis and omikli use the
# event-level models).
noncl_sum_df <- nb_coef_summary(noncl_fit)

noncl_T_sum_df <- nb_coef_summary(noncl_T_fit)

noncl_G_sum_df <- nb_coef_summary(noncl_G_fit)

kat_sum_df <- nb_coef_summary(kat_event_fit)

omi_sum_df <- nb_coef_summary(omi_event_fit)

# Stack the five summary tables, tagging each with its analysis name, and
# add the `bin_group` facet variable (bin name without its trailing digit).
# The plots built from this table facet on `bin_group`; without the column
# facet_wrap(~bin_group) cannot find its faceting variable.
tot_sum_df <- rbind(
  noncl_sum_df %>% mutate(cluster = "non-clust_tot"),
  noncl_T_sum_df %>% mutate(cluster = "non-clust_CtoT"),
  noncl_G_sum_df %>% mutate(cluster = "non-clust_CtoG"),
  kat_sum_df %>% mutate(cluster = "kataegis"),
  omi_sum_df %>% mutate(cluster = "omikli")
) %>%
  mutate(
    bin_group = factor(
      gsub("[0-9]$", "", bin_name),
      levels = c("RepliSeq", "H3K36me3", "RNAseq")
    )
  )

# Point-range plot of the raw model estimates with their 1.96*SE intervals,
# dodged by analysis and faceted by bin variable.
tot_sum_df %>%
  ggplot(
    aes(
      x = factor(bin_name, levels = bin_order),
      y = estimate,
      col = cluster
    )
  ) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  geom_pointrange(
    aes(ymin = lower_CI, ymax = upper_CI, size = 3),
    position = position_dodge(width = .5),
    size = 1
  ) +
  theme_classic() +
  ylab("log2(enrichment of APOBEC mediated SNVs)") +
  theme(
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 20),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks = element_line(size = 2)
  ) +
  scale_color_manual(
    values = c("#DC8D8D", "#ABDDFB", "#911eb4", "#f58231", "#aaffc3")
  ) +
  xlab("<-early late->") +
  facet_wrap(~bin_group, scales = "free_x")


conflicted::conflicts_prefer(dplyr::filter)

# Add log2 versions of the fold value and of its two interval bounds.
tot_sum_df <- mutate(
  tot_sum_df,
  log2_fold = log2(fold),
  log2_lower = log2(lower_fold),
  log2_upper = log2(upper_fold)
)
tot_sum_df
# Point-range plot of log2 fold values (all analyses), faceted by bin
# variable; the legend is kept in this version.
p1 <- tot_sum_df %>%
  #filter(cluster%in%c("kataegis","omikli"))%>%
  ggplot(
    aes(
      x = factor(bin_name, levels = bin_order),
      y = log2_fold,
      col = cluster
    )
  ) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  geom_pointrange(
    aes(ymin = log2_lower, ymax = log2_upper, size = 3),
    position = position_dodge(width = .5),
    size = 1
  ) +
  theme_classic() +
  ylab("enrichment of APOBEC mediated SNVs") +
  theme(
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 40),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 30)
  ) +
  scale_color_manual(
    values = c("#DC8D8D", "#ABDDFB", "#911eb4", "#f58231", "#aaffc3")
  ) +
  xlab("<-early late->") +
  facet_wrap(~bin_group, scales = "free_x") #+
#scale_y_continuous(breaks = seq(-3, 7, by = 2),
#                   lim=c(-4.3,5)
#                   )
p1
# Point-range plot of the raw estimates for the two non-clustered
# substitution classes only, with a fixed y-range and no legend.
p2 <- tot_sum_df %>%
  filter(cluster %in% c("non-clust_CtoT", "non-clust_CtoG")) %>%
  ggplot(
    aes(
      x = factor(bin_name, levels = bin_order),
      y = estimate,
      col = cluster
    )
  ) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  geom_pointrange(
    aes(ymin = lower_CI, ymax = upper_CI, size = 3),
    position = position_dodge(width = .5),
    size = 1
  ) +
  theme_classic() +
  ylab("enrichment of APOBEC mediated SNVs") +
  theme(
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 40),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 30),
    legend.position = "none"
  ) +
  scale_color_manual(
    values = c("#DC8D8D", "#ABDDFB", "#911eb4", "#f58231", "#aaffc3")
  ) +
  xlab("<-early late->") +
  facet_wrap(~bin_group, scales = "free_x") +
  scale_y_continuous(
    breaks = seq(-4, 8, by = 2),
    lim = c(-4.3, 5)
  )

# Write both summary figures as 10 x 10 PDFs.
ggsave(
  filename = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/enrichment_cluster.pdf",
  plot = p1,
  width = 10,
  height = 10
)

ggsave(
  filename = "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig4/enrichment_noncluster.pdf",
  plot = p2,
  width = 10,
  height = 10
)
