library(dplyr)
library(tidyverse)
library(ggplot2)
#library(MASS)
#WT, excl_intergenic, RR27
##A3A mediated cluster event##

# ---- Clustered-mutation table for A3A samples ------------------------------
# Builds `A3A_cl_merge_df`: per-sample variant rows whose `sig_cont` matches the
# dinucleotide pattern below, joined with sample metadata and cluster calls,
# restricted to APOBEC == "A3A" at dose "3ug" or "100ng".

metadata <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/sig/metadata.txt")

cl_df <- read_tsv("/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation/APOBEC_clustered_mutations/annotated/vaf_considered/APOBEC_clustered_mutations.all_samples.annotated.tsv")

# Diagnostic print: the distinct (sample, imd_cutoff) pairs, sorted by cutoff.
cl_df %>%
  dplyr::select(samples, imd_cutoff) %>%
  unique() %>%
  arrange(imd_cutoff)

files_to_read <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter",
  ".excl.*gd$",
  full.names = TRUE
)

# One tibble per input file; `id` is the file name with everything from
# ".mutect2" onwards stripped.
# `dplyr::select` is spelled out so the block also works when MASS (which
# exports its own `select`) is already attached in the session.
vcf_tmp <- lapply(files_to_read, function(x) {
  read_tsv(x) %>%
    mutate(id = gsub(".mutect2.*", "", basename(x))) %>%
    dplyr::select(id, `#CHROM`, POS, REF, ALT, sig_cont,
                  Func_refGene, Gene_refGene, gene_dir)
})
merge_df <- do.call(rbind, vcf_tmp)
merge_df <- merge_df %>%
  filter(grepl("[ATGC][ATGC]>[ATGC][ATGC]", sig_cont))

# Natural join on whatever columns the two tables share
# (presumably `id` -- TODO confirm against metadata.txt).
merge_df <- left_join(merge_df, metadata %>% dplyr::select(-`m/d`))

# Bring the cluster annotation onto the variant rows; the cluster table's
# chr/start/samples columns are renamed to match the variant table.
cluster_calls <- cl_df %>%
  dplyr::select(`#CHROM` = chr, POS = start, id = samples,
                IMD, cluster_id, cluster_type_omikli_upto_3)
merge_df <- left_join(merge_df, cluster_calls)

# Rows without a cluster call get IMD 0, cluster_id "." and cluster "non-clust".
merge_df <- merge_df %>%
  mutate(
    IMD = ifelse(is.na(IMD), 0, IMD),
    cluster_id = ifelse(is.na(cluster_id), ".", cluster_id),
    cluster = ifelse(is.na(cluster_type_omikli_upto_3),
                     "non-clust",
                     cluster_type_omikli_upto_3)
  ) %>%
  dplyr::select(-cluster_type_omikli_upto_3)


A3A_cl_merge_df <- merge_df %>%
  filter(dose %in% c("3ug", "100ng")) %>%
  filter(APOBEC == "A3A")

#####


# ---- Per-site table annotated with epigenomic bins -------------------------
# Rebuilds `merge_df` from the same per-sample files, this time without the
# dinucleotide filter and without `gene_dir`, then attaches the RepliSeq /
# RNAseq / H3K9me3 / H3K27me3 columns from `epi_annot_df`.

#conflicted::conflict_prefer("select", "dplyr")
conflicted::conflicts_prefer(dplyr::filter)
conflicted::conflicts_prefer(dplyr::select)

epi_annot_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/05_vcf/annotation_total/bin5_tot_excl_intergenic//APOBEC_epi_merge.A3A.spread.tsv")

# `metadata`, `cl_df` and `files_to_read` are reused from the top of the
# script. They were previously re-read here from exactly the same paths, and
# nothing in between modifies them, so the second read was redundant I/O.

# Kept from the original; not read anywhere in the visible script.
t_df <- read_tsv(files_to_read[1])

vcf_tmp <- lapply(files_to_read, function(x) {
  read_tsv(x) %>%
    mutate(id = gsub(".mutect2.*", "", basename(x))) %>%
    dplyr::select(id, `#CHROM`, POS, REF, ALT, sig_cont,
                  Func_refGene, Gene_refGene)
})
merge_df <- do.call(rbind, vcf_tmp)
#merge_df<-merge_df%>%filter(grepl("[ATGC][ATGC]>[ATGC][ATGC]",sig_cont))

# Natural join on the shared columns (presumably `id` -- TODO confirm).
merge_df <- left_join(merge_df, metadata %>% dplyr::select(-`m/d`))

cluster_calls <- cl_df %>%
  dplyr::select(`#CHROM` = chr, POS = start, id = samples,
                IMD, cluster_id, cluster_type_omikli_upto_3)
merge_df <- left_join(merge_df, cluster_calls)

# Rows without a cluster call get IMD 0, cluster_id "." and cluster "non-clust".
merge_df <- merge_df %>%
  mutate(
    IMD = ifelse(is.na(IMD), 0, IMD),
    cluster_id = ifelse(is.na(cluster_id), ".", cluster_id),
    cluster = ifelse(is.na(cluster_type_omikli_upto_3),
                     "non-clust",
                     cluster_type_omikli_upto_3)
  ) %>%
  dplyr::select(-cluster_type_omikli_upto_3)


# Attach the epigenomic bin columns; the annotation table stores the position
# in `r2`, which is matched against `POS`.
merge_df <- left_join(
  merge_df %>%
    dplyr::select(id, `#CHROM`, POS, REF, ALT, sig_cont, cluster_id, cluster),
  epi_annot_df %>%
    dplyr::select(`#CHROM`, r2, id, RepliSeq, RNAseq, H3K9me3, H3K27me3),
  by = c("id" = "id", "#CHROM" = "#CHROM", "POS" = "r2")
)

# ---- Reference bin sizes and WT sample selection ---------------------------
# Two header-less size tables (nonTCN and NTN contexts) are read with every
# column as character, joined on their shared bin columns, and their sizes are
# summed into a single `size` per RepliSeq x RNAseq x H3K27me3 combination.
ref_nonTCN_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/06_ref/final/combination/bin5/RR27/excl_intergenic/context/RR27.nonTCN.q5.q5.q5.size.tsv",
  col_names = c("RepliSeq", "RNAseq", "H3K27me3", "size"),
  col_types = cols(.default = col_character())
) %>%
  dplyr::rename(size_nonTCN = size)

ref_NTN_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/06_ref/final/combination/bin5/RR27/excl_intergenic/context/RR27.NTN.q5.q5.q5.size.tsv",
  col_names = c("RepliSeq", "RNAseq", "H3K27me3", "size"),
  col_types = cols(.default = col_character())
) %>%
  dplyr::rename(size_NTN = size)

#ref_df<-ref_df%>%group_by(RepliSeq,RNAseq)%>%
#  dplyr::summarise(tot_size=sum(size))%>%
#  plyr::rename(c("tot_size"="size"))
ref_df <- left_join(ref_nonTCN_df, ref_NTN_df) %>%
  mutate(size = as.double(size_nonTCN) + as.double(size_NTN)) %>%
  dplyr::select(-c(size_nonTCN, size_NTN))

bin6_ref_df <- ref_df
# Diagnostic print: total reference size over all bins.
sum(bin6_ref_df$size)
ref_df$RepliSeq <- as.character(ref_df$RepliSeq)
ref_df$RNAseq <- as.character(ref_df$RNAseq)

# Keep A3A samples that are neither TP53 nor Ctrl, minus two explicit ids.
# NOTE(review): this overwrites `merge_df`, so the later "##merge##" section,
# which filters the same object for ids containing "TP53", starts from a table
# that no longer contains any such id.
merge_df <- merge_df %>%
  filter(
    grepl("A3A", id),
    !grepl("TP53", id),
    !grepl("Ctrl", id),
    !id %in% c("A3A_1st_C3", "A3A_C3_TP53KO_C3")
  )

# Diagnostic print of the clustered-mutation table.
A3A_cl_merge_df

# Epigenomic annotation rows with the matching variant attributes attached
# (the annotation position `r2` is matched against `POS`).
A3A_trans_df <- left_join(
  epi_annot_df,
  A3A_cl_merge_df %>%
    dplyr::select(id, `#CHROM`, POS, gene_dir, sig_cont, REF, ALT),
  by = c("r2" = "POS", "id" = "id", "#CHROM" = "#CHROM")
)





# ---- WT: mutation counts per bin combination and negative-binomial fit ------
# Counts unique sites per RepliSeq x RNAseq x H3K27me3 combination among rows
# whose `sig_cont` matches the two patterns below (and does not contain "C>A"),
# attaches them to the reference sizes and fits
#   count ~ RepliSeq + RNAseq + H3K27me3 + offset(log(sample_size / 10)).
tot_epi_fin_df <- left_join(
  ref_df,
  merge_df %>%
    #filter(id==nm)%>%
    #filter(grepl("_3",id))%>%
    #filter(grepl("C>G",sig_cont)|grepl("C>T",sig_cont))%>%
    #filter(grepl("TC>[TG]",sig_cont))%>%
    filter(grepl("[ACGT]T>[ACG][ACGT]", sig_cont) |
             grepl("[ACG]C>[TG][ACGT]", sig_cont)) %>%
    filter(!grepl("C>A", sig_cont)) %>%
    dplyr::select(`#CHROM`, POS, cluster, RepliSeq, RNAseq, H3K27me3, sig_cont) %>%
    unique() %>%
    group_by(RepliSeq, RNAseq, H3K27me3) %>%
    dplyr::summarise(n = n()) %>%
    ungroup()
)
#  tot_epi_fin_df

# Bin combinations without any mutation come back as NA from the join; this
# replaces every NA in the table with 0 (a second, per-column NA fill on `n`
# that used to follow was a no-op and has been removed).
tot_epi_fin_df[is.na(tot_epi_fin_df)] <- 0

tot_epi_fin_df <- tot_epi_fin_df %>%
  dplyr::rename(count = n) %>%
  mutate(size = as.double(size),
         sample_size = 10 * size * 2)


# Diagnostic print of the modelling table.
tot_epi_fin_df

# MASS provides glm.nb() but also exports `select`; re-assert the dplyr
# preference right after attaching it. The previous call here,
# conflict_prefer(`dplyr::select`), passed a back-quoted symbol that does not
# exist and omitted the `winner` argument, so it raised an error.
library(MASS)
conflicted::conflicts_prefer(dplyr::select)

# Level "2" is listed first so that it becomes the reference level of each
# factor in the model.
tot_epi_fin_df <- tot_epi_fin_df %>%
  mutate(RepliSeq = factor(RepliSeq, levels = c("2", "0", "1", "3", "4"))) %>%
  mutate(RNAseq = factor(RNAseq, levels = c("2", "0", "1", "3", "4"))) %>%
  #  mutate(H3K9me3=factor(H3K9me3,levels=c("2","0","1","3","4")))%>%
  mutate(H3K27me3 = factor(H3K27me3, levels = c("2", "0", "1", "3", "4")))


tot_fit <- glm.nb(
  count ~ RepliSeq + RNAseq + H3K27me3 + offset(log(sample_size / 10)),
  data = tot_epi_fin_df
)

# ---- WT: coefficient / fold-change table with confidence intervals ----------
# One row per bin: model coefficient, log2 and linear fold change relative to
# the reference bin "2", the feature the bin belongs to, and the CI bounds.
# (A `bin_order` vector used to be defined here; it was never read before being
# overwritten further down and listed bins that do not exist, so it is gone.)

# The reference level has no coefficient in the fit, so it is added by hand
# with coefficient 0 (fold change 1).
t_tot_df <- tibble(bin_name = names(tot_fit$coefficients),
                   coef = tot_fit$coefficients) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef))) %>%
  rbind(data.frame(bin_name = c("RepliSeq2", "RNAseq2", "H3K27me32"),
                   coef = c(0, 0, 0),
                   log2_coef = c(0, 0, 0)))

# x-axis order for the plots.
new_bin_order <- c("RepliSeq4", "RepliSeq3", "RepliSeq2", "RepliSeq1", "RepliSeq0",

                   #"DHS0","DHS1","DHS2","DHS3",
                   #"H3K36me30","H3K36me31","H3K36me32","H3K36me33",
                   "RNAseq0", "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4",
                   #                 "H3K9me30","H3K9me31","H3K9me32","H3K9me33","H3K9me34",
                   "H3K27me30", "H3K27me31", "H3K27me32", "H3K27me33", "H3K27me34"
)

# The feature name is the bin name minus its final digit (the bin index).
# Deriving it from the name, instead of labelling rows by position after
# arrange(), keeps the labels right whatever collation arrange() uses
# ("RNAseq" and "RepliSeq" swap places between C and natural-language locales)
# and whatever number of coefficients the fit returns.
new_bin6_RR3D_tot_event_df <- t_tot_df %>%
  mutate(fold = exp(coef)) %>%
  arrange(bin_name) %>%
  mutate(bin_group = sub("[0-9]$", "", bin_name))

# confint() is evaluated once and reused for both the bounds and the row names.
ci_mat <- confint(tot_fit)
ci_df <- as_tibble(ci_mat) %>%
  dplyr::rename(lower_ci_coef = `2.5 %`,
                upper_ci_coef = `97.5 %`) %>%
  mutate(lower_ci = exp(lower_ci_coef),
         upper_ci = exp(upper_ci_coef),
         bin_name = rownames(ci_mat)) %>%
  # Reference bins: coefficient 0, i.e. fold change exactly 1. The former
  # "H3K9me32" row is dropped because H3K9me3 is not a term of the model.
  bind_rows(tibble(bin_name = c("RepliSeq2", "RNAseq2", "H3K27me32"),
                   lower_ci_coef = 0,
                   upper_ci_coef = 0,
                   lower_ci = 1,
                   upper_ci = 1)) %>%
  filter(bin_name != "(Intercept)")

new_bin6_RR3D_tot_event_df <-
  left_join(new_bin6_RR3D_tot_event_df, ci_df, by = "bin_name")

# ---- WT: fold-change plot ---------------------------------------------------
# Point + line plot of `fold` per bin with CI bars, one facet per `bin_group`,
# x positions ordered by `new_bin_order`; the dashed line marks fold change 1.
p <- ggplot(
  new_bin6_RR3D_tot_event_df,
  #filter(group%in%c("non-clust","kataegis","omikli"))%>%
  aes(x = factor(bin_name, levels = new_bin_order),
      y = fold,
      group = bin_group)
) +
  geom_point(size = 5) +
  geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci, width = 0)) +
  geom_line() +
  theme_classic() +
  #ggtitle("clustered Multivariate analysis")+
  xlab("") +
  ylab("Fold change  in A3A mutations") +
  theme(
    plot.title = element_text(size = 20),
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 40),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 30),
    legend.position = "none"
  ) +
  geom_hline(yintercept = 1, color = "black", linetype = "dashed") +
  scale_y_continuous(breaks = seq(0, 3, by = 0.5),
                     limits = c(0, 3)) +
  facet_wrap(~bin_group, scales = "free_x", ncol = 4) +
  #ggtitle("WT_own_bin4")+
  # No colour aesthetic is mapped above, so this scale has nothing to act on.
  scale_color_manual(values = c("RED", "black"))
#ggtitle("WT_ori_bin4")+
#ggtitle(nm)

# Draw the plot on the active device, then save it as a PDF.
p
ggsave(
  filename = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/bin5_excl_intergenic/WT_merged.RR27.excl_intergenic.enrich.else_TCN.pdf",
  plot = p,
  width = 10,
  height = 5
)


# ---- WT: export enrichment, count and rate tables ---------------------------

# 1) Coefficient / fold-change table, tab-separated.
write.table(
  new_bin6_RR3D_tot_event_df,
  file = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/bin5_excl_intergenic/WT_merged.RR27.excl_intergenic.enrich.else_TCN.txt",
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)

# 2) Count table: the modelling table tagged with constant descriptor columns,
#    with `size` doubled and `sample_size` removed.
tot_epi_fin_cont_df <- tot_epi_fin_df %>%
  mutate(TP53 = "WT", bin_type = "bin5") %>%
  mutate(ref = "own", sample = "organoid", region = "excl_intergenic") %>%
  mutate(size = size * 2) %>%
  dplyr::select(-sample_size)
#mutate(id=nm)

write.table(
  tot_epi_fin_cont_df %>%
    dplyr::select(sample, ref, bin_type, TP53, RepliSeq, RNAseq, H3K27me3,
                  count, size, region),
  file = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/bin5_excl_intergenic/WT_merged.RR27.excl_intergenic.merged.else_TCN.count.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# 3) Rate table: one row per feature (`seq`) and bin, summing size and count
#    over the other features.
# NOTE(review): `size` was already doubled above, so `tot_size` here is
# 40 x the summed reference size, while the model offset used 20 x size --
# confirm the second factor of 2 is intended.
rate_df <- tot_epi_fin_cont_df %>%
  gather(seq, bin, RepliSeq:H3K27me3) %>%
  #  select(-sample_size)%>%
  mutate(TP53 = "WT", bin_type = "bin5") %>%
  mutate(ref = "own", sample = "organoid", region = "excl_intergenic") %>%
  group_by(ref, bin_type, TP53, seq, bin, region) %>%
  dplyr::summarise(tot_size = sum(size) * 10 * 2,
                   tot_count = sum(count)) %>%
  mutate(rate = tot_count / tot_size) %>%
  mutate(size = tot_size / 10)
#mutate(id=nm)%>%

write.table(
  rate_df,
  file = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/bin5_excl_intergenic/WT_merged.RR27.excl_intergenic.merged.else_TCN.rate.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# ---- WT: log2 fold-change plot ----------------------------------------------
# Same layout as the linear plot, on a log2 scale (dashed line at 0).
# The y aesthetic previously referenced `new_fold`, a column that does not
# exist in this table at this point of the script (it is only created in the
# later KO section), so saving the plot failed. `fold` is already expressed
# relative to the reference bin "2" here, so log2(fold) is the intended value.
p_log <- new_bin6_RR3D_tot_event_df %>%
  #filter(group%in%c("non-clust","kataegis","omikli"))%>%
  ggplot(aes(x = factor(bin_name, levels = new_bin_order),
             y = log2(fold),
             group = bin_group)) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  #ggtitle("clustered Multivariate analysis")+
  xlab("") +
  # Label corrected: the axis shows log2 values, not raw fold changes.
  ylab("log2 fold change in A3A mutations") +
  theme(plot.title = element_text(size = 20),
        axis.title = element_text(size = 20),
        axis.text.y = element_text(size = 40),
        axis.text.x = element_blank(),
        axis.ticks.length = unit(.4, "cm"),
        axis.ticks.y = element_line(size = 2),
        axis.ticks.x = element_blank(),
        strip.text.x = element_text(size = 30),
        legend.position = "none"
  ) +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  # Values outside these limits are dropped from the plot by ggplot2.
  scale_y_continuous(breaks = c(-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3),
                     limits = c(-0.3, 0.32)) +
  facet_wrap(~bin_group, scales = "free_x", ncol = 4) +
  #ggtitle("WT_own_bin4")+
  scale_color_manual(values = c("RED", "black"))
ggsave("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/bin5_excl_intergenic/WT_merged.excl_intergenic.enrich.log.pdf",
       p_log,
       width = 10, height = 5)

##merge##
# ---- KO: mutation counts per RepliSeq x RNAseq bin and NB fit ---------------
# Counts unique sites of A3A TP53 samples (no Ctrl, minus one explicit id)
# whose `sig_cont` contains C>G or C>T and matches "TC>[TG]", then fits
#   count ~ RepliSeq + RNAseq + offset(log(sample_size / 5)).
ko_merge_df <- merge_df %>%
  #filter(id==nm)%>%
  #filter(grepl("_3",id)|id=="A3A_1st_C3_100ng-1")%>%
  filter(grepl("TP53", id)) %>%
  filter(!grepl("Ctrl", id)) %>%
  filter(grepl("A3A", id)) %>%
  filter(id != "A3A_C3_TP53KO_C3") %>%
  filter(grepl("C>G", sig_cont) | grepl("C>T", sig_cont)) %>%
  filter(grepl("TC>[TG]", sig_cont))

# NOTE(review): earlier in this script `merge_df` is overwritten with a copy
# from which every id containing "TP53" was removed, so this selection is
# empty unless `merge_df` is rebuilt first. Warn instead of silently fitting
# a model on all-zero counts.
if (nrow(ko_merge_df) == 0) {
  warning("No TP53 rows left in `merge_df`; KO counts will all be zero. ",
          "Rebuild `merge_df` without the WT-only filter before this section.",
          call. = FALSE)
}

# `ref_df` holds one size per RepliSeq x RNAseq x H3K27me3 combination, while
# the counts below are per RepliSeq x RNAseq only. Joining them directly would
# repeat each count once per H3K27me3 stratum, so the reference sizes are
# first collapsed to the same two dimensions.
ko_ref_df <- ref_df %>%
  group_by(RepliSeq, RNAseq) %>%
  dplyr::summarise(size = sum(as.double(size))) %>%
  ungroup()

tot_epi_fin_df <- left_join(
  ko_ref_df,
  ko_merge_df %>%
    dplyr::select(`#CHROM`, POS, cluster, RepliSeq, RNAseq, sig_cont) %>%
    unique() %>%
    group_by(RepliSeq, RNAseq) %>%
    dplyr::summarise(n = n()) %>%
    ungroup(),
  by = c("RepliSeq", "RNAseq")
)
#  tot_epi_fin_df

# Bins without any mutation come back as NA from the join; set them to 0.
tot_epi_fin_df[is.na(tot_epi_fin_df)] <- 0

tot_epi_fin_df <- tot_epi_fin_df %>%
  dplyr::rename(count = n) %>%
  mutate(size = as.double(size),
         sample_size = 5 * size)
#tot_epi_fin_df$RepliSeq<-as.character(tot_epi_fin_df$RepliSeq)
tot_epi_fin_df$RNAseq <- as.character(tot_epi_fin_df$RNAseq)
#tot_epi_fin_df$H3K36me3<-as.character(tot_epi_fin_df$H3K36me3)


library(MASS)
#conflicted::conflict_prefer(`dplyr::select`)
tot_fit <- glm.nb(count ~ RepliSeq + RNAseq + offset(log(sample_size / 5)),
                  data = tot_epi_fin_df)
# ---- KO: coefficient / fold-change table ------------------------------------
# One row per bin with the model coefficient, the fold change relative to the
# model's reference bin "0", and `new_fold`, the fold change re-expressed
# relative to bin "2" of the same feature.

# Not read anywhere in the visible script; kept as in the original.
bin_order <- c("RepliSeq1", "RepliSeq2", "RepliSeq3", "RepliSeq4",
               "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4", "RNAseq5"
)


# The reference bin "0" has no coefficient in the fit; add it with coef 0.
t_tot_df <- tibble(bin_name = names(tot_fit$coefficients),
                   coef = tot_fit$coefficients) %>%
  filter(bin_name != "(Intercept)") %>%
  mutate(log2_coef = log2(exp(coef))) %>%
  rbind(data.frame(bin_name = c("RepliSeq0", "RNAseq0"),
                   coef = c(0, 0),
                   log2_coef = c(0, 0)))

# x-axis order for the plot.
new_bin_order <- c("RepliSeq4", "RepliSeq3", "RepliSeq2", "RepliSeq1", "RepliSeq0",
                   #"DHS0","DHS1","DHS2","DHS3",
                   #"H3K36me30","H3K36me31","H3K36me32","H3K36me33",
                   "RNAseq0", "RNAseq1", "RNAseq2", "RNAseq3", "RNAseq4", "RNAseq5"
)

# The feature name is the bin name minus its final digit. The previous
# positional labelling (first five sorted rows = "RepliSeq", next five =
# "RNAseq") is wrong whenever arrange() sorts in the C locale, where
# "RNAseq..." precedes "RepliSeq...", and breaks if the fit does not return
# exactly ten rows.
new_bin6_RR3D_tot_event_df <- t_tot_df %>%
  mutate(fold = exp(coef)) %>%
  arrange(bin_name) %>%
  mutate(bin_group = sub("[0-9]$", "", bin_name))

# Fold change of bin "2" within each feature, used as the new baseline.
base_fold_df <- new_bin6_RR3D_tot_event_df %>%
  filter(endsWith(bin_name, "2")) %>%
  dplyr::select(bin_group, base_fold = fold)

new_bin6_RR3D_tot_event_df <- new_bin6_RR3D_tot_event_df %>%
  left_join(base_fold_df, by = "bin_group") %>%
  mutate(new_fold = fold / base_fold)
# ---- KO: plot and table exports ---------------------------------------------
# `nm` is used both as the plot title and as the stem of every output file.
nm <- "KO_merge"
ko_out_dir <- "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/R_data/by_sample/rep_bin5_rna_bin5/"

# Point + line plot of `new_fold` per bin, one facet per feature; the dashed
# line marks fold change 1.
p <- ggplot(
  new_bin6_RR3D_tot_event_df,
  #filter(group%in%c("non-clust","kataegis","omikli"))%>%
  aes(x = factor(bin_name, levels = new_bin_order),
      y = new_fold,
      group = bin_group)
) +
  geom_point(size = 5) +
  geom_line() +
  theme_classic() +
  #ggtitle("clustered Multivariate analysis")+
  xlab("") +
  ylab("Fold change  in A3A mutations") +
  theme(
    plot.title = element_text(size = 20),
    axis.title = element_text(size = 20),
    axis.text.y = element_text(size = 40),
    axis.text.x = element_blank(),
    axis.ticks.length = unit(.4, "cm"),
    axis.ticks.y = element_line(size = 2),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 30),
    legend.position = "none"
  ) +
  geom_hline(yintercept = 1, color = "black", linetype = "dashed") +
  scale_y_continuous(breaks = seq(0, 2, by = 0.5),
                     limits = c(0, 2)) +
  facet_wrap(~bin_group, scales = "free_x", ncol = 4) +
  #ggtitle("WT_own_bin4")+
  scale_color_manual(values = c("RED", "black")) +
  #ggtitle("WT_ori_bin4")+
  ggtitle(nm)
#p
ggsave(filename = paste0(ko_out_dir, nm, ".pdf"),
       plot = p,
       width = 10,
       height = 5)

# Coefficient / fold-change table.
write.table(new_bin6_RR3D_tot_event_df,
            file = paste0(ko_out_dir, nm, ".enrich.txt"),
            row.names = FALSE,
            quote = FALSE,
            sep = "\t")

# Count table tagged with constant descriptor columns.
tot_epi_fin_cont_df <- tot_epi_fin_df %>%
  mutate(TP53 = "KO", bin_type = "bin6") %>%
  mutate(ref = "own", sample = "organoid") %>%
  dplyr::select(-sample_size) %>%
  mutate(id = nm)

write.table(
  tot_epi_fin_cont_df %>%
    dplyr::select(sample, ref, bin_type, TP53, RepliSeq, RNAseq, count),
  file = paste0(ko_out_dir, nm, ".tsv"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# Per-feature totals: size, count and their ratio for every bin of RepliSeq
# and RNAseq.
ko_rate_df <- tot_epi_fin_cont_df %>%
  gather(seq, bin, RepliSeq:RNAseq) %>%
  #  select(-sample_size)%>%
  mutate(TP53 = "KO", bin_type = "bin6") %>%
  mutate(ref = "own", sample = "organoid") %>%
  group_by(sample, ref, bin_type, TP53, seq, bin) %>%
  dplyr::summarise(tot_size = sum(size),
                   tot_count = sum(count)) %>%
  mutate(rate = tot_count / tot_size) %>%
  mutate(id = nm)

write.table(ko_rate_df,
            file = paste0(ko_out_dir, nm, ".merge.tsv"),
            sep = "\t",
            quote = FALSE,
            row.names = FALSE)

