library(ggplot2)
library(dplyr)
library(tidyverse)
library(BSgenome)
library(GenomicRanges)
library(ggplot2)
library(ggprism)
library(patchwork)
library(magrittr)
library(Rsamtools)
library(stringi)

# ---- 1. Load per-sample clonal indel calls and attach sample metadata ----
# Every file in the clonal indel directory whose name matches
# "indel_type_anno.vcf" is read as a tab-separated table. The sample id is
# derived from the file name by dropping everything from ".mutect2" onwards.
files_to_read <- list.files("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/indel/clonal/",
                            "indel_type_anno.vcf",
                            full.names = TRUE)
files_to_read
# Fail early with a clear message instead of a cryptic `NULL$id` error below.
if (length(files_to_read) == 0) {
  stop("No indel_type_anno.vcf files found in the clonal indel directory",
       call. = FALSE)
}
tmp <- lapply(files_to_read, function(x) {
  # `#CHROM` is forced to character so that every file yields the same column
  # type regardless of which contigs it contains (the SNV readers further down
  # in this script pass the same `col_types`).
  read_tsv(x, col_types = cols(`#CHROM` = "c")) %>%
    mutate(id = gsub(".mutect2.*", "", basename(x))) %>%
    select('#CHROM', POS, REF, ALT, id, type)
})
#tmp[[1]]
sig_df <- do.call(rbind, tmp)
sig_df$id %>% unique()
sig_df %>% filter(id == "A3B_1st_C5_100ng_48h_SC-3")

# Sample sheet; it must provide the APOBEC, dose and TP53 columns used below.
metadata <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/indel/clonal/sig/metadata.txt")
metadata %>% filter(grepl("SC", id))
metadata$id %>% unique()

# Natural join on all shared column names.
# NOTE(review): presumably only `id` is shared - confirm and make the key
# explicit with `by = "id"`.
sig_df <- left_join(sig_df, metadata)
# `group` is the experimental condition label: <APOBEC>_<dose>_<TP53>.
sig_df <- sig_df %>% mutate(group = paste(APOBEC, dose, TP53, sep = "_"))
sig_df$id %>% unique()
metadata
sig_df
# Untouched copy of the annotated calls; later sections restart from it.
merge_df <- sig_df
#sig_df%>%filter(info=="d1_c_1")%>%group_by(id,APOBEC,dose,TP53)%>%dplyr::summarise(n=n())

#d1c1_sum_df<-norm_sig_df%>%filter(info=="d1_c_1")%>%group_by()%>%dplyr:::summarise(n=n())

#d1c1_sum_df<-merge_df%>%filter(type=="d1c1")%>%group_by(id,group)%>%dplyr:::summarise(n=n())

#ggplot(sig_df%>%filter(info=="d1_c_1"),aes(x=group,y=Original))+
#  geom_boxplot()#+
#  ylim(0,13)


# ---- Reference sequence context around each indel ----
# Handle on the reference FASTA used by every getSeq() lookup below.
fasta_file <- FaFile(
  file = '/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa'
)

# 3-bp window per variant: POS .. POS+2 on the "+" strand.
three_bp_df <- GRanges(
  seqnames = merge_df$`#CHROM`,
  IRanges(
    start = as.numeric(merge_df$POS),
    end = as.numeric(merge_df$POS) + 2
  ),
  strand = "+"
)
refbase <- as.data.frame(getSeq(fasta_file, three_bp_df))$x
df <- mutate(merge_df, three_bp_cont = refbase)
df

# Orient the trinucleotide so that its middle base is C or T: windows whose
# centre is already C/T are kept, all others are reverse-complemented.
out_df <- mutate(
  df,
  sig_cont = ifelse(
    substr(three_bp_cont, 2, 2) %in% c("C", "T"),
    df$three_bp_cont,
    paste0(substr(stri_reverse(chartr("ATGC", "TACG", three_bp_cont)), 1, 3))
  )
)
out_df

# Wider window: POS-4 .. POS+6 (11 bases, despite the "ten_bp" naming).
ten_bp_df <- GRanges(
  seqnames = out_df$`#CHROM`,
  IRanges(
    start = as.numeric(out_df$POS - 4),
    end = as.numeric(out_df$POS) + 6
  ),
  strand = "+"
)
refbase <- as.data.frame(getSeq(fasta_file, ten_bp_df))$x
df <- mutate(out_df, ten_bp_cont = refbase)

# Same orientation rule as above, decided by the centre of the 3-bp window.
out_df <- mutate(
  df,
  ten_sig_cont = ifelse(
    substr(three_bp_cont, 2, 2) %in% c("C", "T"),
    df$ten_bp_cont,
    stri_reverse(chartr("ATGC", "TACG", ten_bp_cont))
  )
)
out_df

# ---- Median number of 1-bp C deletions per trinucleotide context ----
# Number of distinct samples in each experimental group (printed only).
out_df %>%
  select(group, id) %>%
  unique() %>%
  group_by(group) %>%
  dplyr::summarise(sample_n = n())

# Count "d1c*" calls per sample and context, then take the per-group median
# across samples. Only samples with at least one call in a context contribute.
out_sim_df <- out_df %>%
  filter(grepl("d1c", type)) %>%
  group_by(id, group, sig_cont) %>%
  dplyr::summarise(n = n()) %>%
  ungroup() %>%
  group_by(group, sig_cont) %>%
  dplyr::summarise(med_n = median(n)) %>%
  print(n = 74)

out_sim_df

# The 16 C-centred trinucleotides, in plotting order.
contextorder16 <- c(
  "ACA", "ACC", "ACG", "ACT",
  "CCA", "CCC", "CCG", "CCT",
  "GCA", "GCC", "GCG", "GCT",
  "TCA", "TCC", "TCG", "TCT"
)

# Full group x context grid so that absent combinations become explicit rows.
tmp_df <- cbind(rep(unique(out_sim_df$group), each = 16), contextorder16) %>%
  as_tibble()
colnames(tmp_df) <- c("group", "sig_cont")
out_sim_df %>% filter(group == "A3A_100ng_KO")
count_df <- left_join(
  tmp_df,
  out_sim_df,
  by = c("group" = "group", "sig_cont" = "sig_cont")
) %>%
  print(n = 50)
# Combinations missing from the data get a median of zero.
count_df <- mutate(count_df, med_n = ifelse(is.na(med_n), 0, med_n))
unique(count_df$group)
# Fixed display order of the experimental groups.
count_df$group <- factor(
  count_df$group,
  levels = c(
    "A3A_CTRL_WT", "A3A_100ng_WT", "A3A_3ug_WT",
    "A3A_CTRL_KO", "A3A_100ng_KO", "A3A_3ug_KO",
    "A3B_100ng_WT",
    "A3B_CTRL_KO", "A3B_100ng_KO", "A3B_3ug_KO"
  )
)

# Facet strip labels for the six A3A groups (dose only).
labels <- rep(c("CTRL", "0.1ug/ml", "3ug/ml"), 2)
names(labels) <- c(
  "A3A_CTRL_WT", "A3A_100ng_WT", "A3A_3ug_WT",
  "A3A_CTRL_KO", "A3A_100ng_KO", "A3A_3ug_KO"
)
count_df %>%
  filter(grepl("A3A", group)) %>%
  ungroup() %>%
  select(group) %>%
  unique()
# ---- Figure 4.3: [-C] deletions per trinucleotide context (A3A, TP53 WT) ----
#pdf("/home/users/ayh/Projects/27_A3B/06_Figure_code/Fig4/4.3_TCN_context.pdf",family="ArialMT")
# The matching pdf() call above is commented out, so an unconditional
# dev.off() stops the script with "cannot shut down device 1 (the null
# device)". Only close a device when one is actually open.
if (dev.cur() > 1L) {
  dev.off()
}

# One facet per A3A/WT dose group, bars = median count per context.
# `col = APOBEC` is a constant column, mapped to black by scale_color_manual.
p <- ggplot(
  count_df %>%
    filter(grepl("A3A", group)) %>%
    filter(grepl("WT", group)) %>%
    mutate(APOBEC = "A3A"),
  aes(x = sig_cont, y = med_n, fill = group, col = APOBEC)
) +
  geom_bar(stat = "identity") +
  facet_wrap(~group, scales = "free_x", labeller = labeller(group = labels)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 20, family = "Consolas"),
    axis.text.y = element_text(size = 40),
    axis.title.y = element_text(size = 40),
    strip.text = element_text(size = 30),
    legend.position = "none",
    axis.ticks.length.y = unit(.5, "cm"),
    axis.ticks.y = element_line(colour = "black", size = 3)
  ) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "grey", fill = NA, size = 1)
  ) +
  # Fixed y range; bars above 4.5 are dropped by the scale limits.
  scale_y_continuous(expand = c(0, 0), limits = c(0, 4.5)) +
  ylab("# of [-C] in TCN context") +
  xlab("") +
  scale_fill_manual(values = rep(c("#a9a9a9", "#abddfb", "#dc8d8d", "#000075", "#469990", "#4363d8"), 2)) +
  scale_color_manual(values = c("black"))

p

# cowplot supplies save_plot(); the figure is written through cairo_pdf.
library(cowplot)
save_plot("/home/users/ayh/Projects/27_A3B/06_Figure_code/Fig4/4.3_TCN_context.pdf", p,
          ncol = 1,
          nrow = 2,
          base_asp = 7,
          #unit="px",
          device = cairo_pdf)

# Median [-C] count per context for every A3A group (WT and KO), one facet
# per group. Drawn on the active device; nothing is saved here.
ggplot(
  filter(count_df, grepl("A3A", group)),
  aes(x = sig_cont, y = med_n, fill = group)
) +
  geom_col() +
  facet_wrap(~group, scale = "free_x", labeller = labeller(group = labels)) +
  theme_bw() +
  theme(
    # text sizes and ticks
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 20, family = "Consolas"),
    axis.text.y = element_text(size = 55),
    axis.title.y = element_text(size = 60),
    strip.text = element_text(size = 30),
    legend.position = "none",
    axis.ticks.length.y = unit(.5, "cm"),
    axis.ticks.y = element_line(colour = "black", size = 3),
    # bare panel with a grey frame
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "grey", fill = NA, size = 1)
  ) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 4.5)) +
  ylab("") +
  scale_fill_manual(
    values = rep(c("#fdc6c6", "#fa8e8e", "#f73030", "#deaef3", "#ab8fe3", "#6f41cf"), 2)
  )



####2. linear regression with SNVs######
# Freeze the sample ids as factor levels BEFORE filtering, so that samples
# which lose all of their calls below still appear (with a zero count).
out_df$id <- factor(out_df$id, levels = out_df$id %>% unique())

# Positions called more than once across the cohort (same chromosome + POS)
# are written out and then excluded from the analysis.
common_indel_df <- out_df %>%
  group_by(`#CHROM`, POS) %>%
  dplyr::summarise(n = n()) %>%
  filter(n > 1) %>%
  mutate(info = paste(`#CHROM`, POS, sep = "_"))
common_indel_df %>%
  write.table("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/indel/clonal/common_clonal_indel.vcf",
              sep = "\t",
              quote = FALSE,
              row.names = FALSE)

out_df <- out_df %>% mutate(info = paste(`#CHROM`, POS, sep = "_"))

out_df <- out_df %>% filter(!info %in% common_indel_df$info)

# Per-sample, per-context counts of "d1c*" calls.
TCN_count_df <- out_df %>%
  filter(grepl("d1c", type)) %>%
  group_by(id, group, sig_cont) %>%
  dplyr::summarise(n = n()) %>%
  ungroup()
TCN_count_df %>% select(id, group) %>% unique()
out_df
out_df %>%
  filter(grepl("d1c", type)) %>%
  group_by(id, group, sig_cont) %>%
  dplyr::summarise(n = n()) %>%
  ungroup() %>%
  filter(grepl("TC[A,C,G,T]", sig_cont))

#id_df<-out_df%>%filter(grepl("d1c",type))%>%group_by(id,group,sig_cont)%>%dplyr::summarise(n=n())%>%sele
# Wide table: one row per sample, one column per TCN context, NA -> 0.
tmp_df <- out_df %>%
  filter(grepl("d1c", type)) %>%
  group_by(id, group, sig_cont) %>%
  dplyr::summarise(n = n()) %>%
  ungroup() %>%
  filter(grepl("TC[A,C,G,T]", sig_cont)) %>%
  spread(sig_cont, n)
tmp_df
tmp_df[is.na(tmp_df)] <- 0

# Total TCN count per sample. The left side of the join enumerates every
# level of the `id` factor, so samples without any TCN call are kept and get
# nTCN = 0 on the next line.
TCN_count_df <- left_join(
  expand.grid(id = levels((TCN_count_df %>% select(id) %>% unique())$id)),
  tmp_df %>%
    gather(sig_cont, n, TCA:TCT) %>%
    group_by(id, group) %>%
    dplyr::summarise(nTCN = sum(n)) %>%
    select(id, nTCN)
)
TCN_count_df[is.na(TCN_count_df)] <- 0
#TCN_count_df$id
#TCN_count_df$id

# Lookup table joined to both the SNV and the indel tables below.
# NOTE(review): presumably it maps `snv_id` to `indel_id` - confirm.
id_df <- read_tsv("/home/users/ayh/Projects/27_A3B/06_Figure_code/indel/id.txt")

# Per-sample APOBEC SNV burden; columns renamed to `2and13sum` / `snv_id`.
SNV_df <- read_tsv("/home/users/ayh/Projects/27_A3B/06_Figure_code/indel/APOBEC_sig_df.3.txt") %>%
  plyr::rename(c("APOBEC_mutation" = "2and13sum", "id" = "snv_id")) %>%
  left_join(id_df)
SNV_df
# as_tibble() replaces the deprecated as.tibble().
TCN_count_df <- as_tibble(TCN_count_df) %>%
  plyr::rename(c("id" = "indel_id")) %>%
  left_join(id_df)
SNV_indel_df <- left_join(
  TCN_count_df,
  SNV_df %>% select(snv_id, APOBEC, TP53, dose, `2and13sum`)
) %>%
  as_tibble()
SNV_indel_df$TP53 <- factor(SNV_indel_df$TP53, levels = c("WT", "KO"))
SNV_indel_df$dose <- factor(SNV_indel_df$dose, levels = c("CTRL", "100ng", "3ug"))


#' Plotmath label for the linear fit of `nTCN` on `2and13sum`.
#'
#' The script previously defined this function twice in a row; the first
#' definition (digits = 2) was dead code, immediately shadowed by the second.
#' Only the effective definition is kept.
#'
#' @param df Data frame with numeric columns `nTCN` and `2and13sum`.
#' @param a,b Unused. Kept so existing calls with three arguments still work;
#'   the intercept and slope are always taken from the fitted model.
#' @return A length-one character vector holding a plotmath expression
#'   (intercept and slope with 4 significant digits, r^2 with 3), intended
#'   for `geom_text(..., parse = TRUE)`.
lm_eqn <- function(df, a, b) {
  m <- lm(nTCN ~ `2and13sum`, df)
  # The substitution list supplies a, b and r2 as already-formatted strings.
  eq <- substitute(italic(y) == a + b %.% italic(x)*"\\n"~~italic(r)^2~"="~r2,
                   list(a = format(unname(coef(m)[1]), digits = 4),
                        b = format(unname(coef(m)[2]), digits = 4),
                        r2 = format(summary(m)$r.squared, digits = 3)))
  as.character(as.expression(eq))
}


# ---- Exploratory fit: TCN deletions vs APOBEC SNV burden (A3A) ----
SNV_indel_df %>% filter(APOBEC == "A3A")
# Model on A3A / TP53-WT samples. An earlier fit on all A3A samples was
# overwritten on the very next line without being used, so it was removed.
m <- lm(nTCN ~ `2and13sum`, SNV_indel_df %>% filter(APOBEC == "A3A") %>% filter(TP53 == "WT"))
# library() errors if broom is missing; require() would only return FALSE and
# let glance() fail later with a less helpful message.
library(broom)
lm_pval <- glance(m)$p.value
# NOTE(review): the label below uses all A3A samples, whereas `m` is WT only.
lm_eqn(SNV_indel_df %>% filter(APOBEC == "A3A"))
format(summary(m)$coefficients[, 4], digits = 3)
format(summary(m)$r.squared, digits = 3)

# Relabel the dose levels for display and restore the factor order.
SNV_indel_df <- SNV_indel_df %>%
  mutate(dose = ifelse(dose == "CTRL", "CTRL",
                       ifelse(dose == "100ng", "100ng/ml", "3ug/ml")))
SNV_indel_df$dose <- factor(SNV_indel_df$dose, levels = c("CTRL", "100ng/ml", "3ug/ml"))
SNV_indel_df
SNV_indel_df %>% filter(dose == "3ug/ml")

# Scatter coloured by dose, shaped by TP53 status, with a linear fit and 95%
# band. size/stroke/alpha are constants placed inside aes(); their legends
# are suppressed or overridden in guides() below.
ggplot(SNV_indel_df %>% filter(APOBEC == "A3A"), aes(x = `2and13sum`, y = nTCN)) +
  geom_point(aes(col = dose, shape = TP53, size = 5, stroke = 5, alpha = 0.5)) +
  scale_shape_manual(values = c(16, 4)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 40),
    axis.text.y = element_text(size = 40),
    axis.title.y = element_text(size = 40),
    axis.title.x = element_text(size = 40),
    legend.text = element_text(size = 30),
    legend.title = element_text(size = 30),
    axis.ticks.length = unit(.5, "cm"),
    axis.ticks = element_line(colour = "black", size = 3)
  ) +
  scale_x_continuous(limits = c(0, 5000)) +
  scale_y_continuous(limits = c(0, 15)) +
  stat_smooth(method = lm, level = 0.95,
              colour = "black") +
  #  scale_color_discrete(labels=c("CTRL","100ng/ml","3ug/ml"))+
  scale_color_manual(values = c("#f79430", "#3030f7", "#9430f7")) +
  xlab("APOBEC mediated SNVs") +
  ylab("Indels in TCN context") +
  guides(shape = guide_legend(override.aes = list(size = 10), order = 1, title = "TP53 status"),
         color = guide_legend(override.aes = list(size = 10), order = 2),
         alpha = "none",
         size = "none")
# ---- Figure 4.4: regression of TCN deletions on APOBEC SNVs (A3A, WT) ----
# `Sample` is the condition label <APOBEC>_<dose>_<TP53> in display order.
SNV_indel_df2 <- SNV_indel_df %>% mutate(Sample = paste(APOBEC, dose, TP53, sep = "_"))
SNV_indel_df2$Sample <- factor(
  SNV_indel_df2$Sample,
  levels = c("A3A_CTRL_WT", "A3A_100ng/ml_WT", "A3A_3ug/ml_WT",
             "A3A_CTRL_KO", "A3A_100ng/ml_KO", "A3A_3ug/ml_KO",
             "A3B_100ng/ml_WT",
             "A3B_CTRL_KO", "A3B_100ng/ml_KO", "A3B_3ug/ml_KO")
)
SNV_indel_df2$Sample %>% unique()
pdf("/home/users/ayh/Projects/27_A3B/06_Figure_code/Fig4/4.4_linear_regression.pdf", width = 15, height = 10)
# ggpmisc supplies stat_poly_line() / stat_poly_eq().
library(ggpmisc)
SNV_indel_df2
ggplot(
  SNV_indel_df2 %>%
    mutate(type = paste(APOBEC, dose, TP53, sep = "_")) %>%
    filter(APOBEC == "A3A") %>%
    filter(TP53 == "WT"),
  aes(x = `2and13sum`, y = nTCN)
) +
  geom_point(aes(size = 5, stroke = 5)) +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 40),
    axis.text.y = element_text(size = 40),
    axis.title.y = element_text(size = 40),
    axis.title.x = element_text(size = 40),
    legend.text = element_text(size = 30),
    legend.title = element_text(size = 30),
    axis.ticks.length = unit(.5, "cm"),
    axis.ticks = element_line(colour = "black", size = 3)
  ) +
  scale_x_continuous(limits = c(0, 5000)) +
  scale_y_continuous(limits = c(0, 15)) +
  # Fitted line, then two annotations: the equation and the default label.
  stat_poly_line(color = "red") +
  stat_poly_eq(aes(label = after_stat(eq.label)), size = 10, label.x = 0.9, label.y = 0.1) +
  stat_poly_eq(label.y = 0.05, size = 10, label.x = 0.76) +
  #  scale_color_discrete(labels=c("CTRL","100ng/ml","3ug/ml"))+
  scale_color_manual(values = c("#fdc6c6", "#fa8e8e", "#f73030", "#deaef3", "#ab8fe3", "#6f41cf")) +
  xlab("APOBEC mediated SNVs") +
  ylab("# of [-C] in TCN context") +
  guides(color = guide_legend(override.aes = list(size = 10)),
         size = "none")
# Close the device opened by pdf() above. It was previously left open, so the
# next pdf() call stacked a second device and this file was never finalised.
dev.off()
# ---- Figure 3 (edited): same regression, "neg" samples excluded ----
# Opens a PDF device, draws the scatter with an lm line and the lm_eqn()
# label, then closes the device.
pdf(
  "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/4.4_linear_regression.v2.pdf",
  width = 15,
  height = 10
)
ggplot(
  SNV_indel_df2 %>%
    mutate(type = paste(APOBEC, dose, TP53, sep = "_")) %>%
    filter(APOBEC == "A3A", TP53 == "WT", !grepl("neg", indel_id)),
  aes(x = `2and13sum`, y = nTCN)
) +
  geom_point(aes(size = 5, stroke = 5)) +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 40),
    axis.text.y = element_text(size = 40),
    axis.title.y = element_text(size = 40),
    axis.title.x = element_text(size = 40),
    legend.text = element_text(size = 30),
    legend.title = element_text(size = 30),
    axis.ticks.length = unit(.5, "cm"),
    axis.ticks = element_line(colour = "black", size = 3)
  ) +
  scale_x_continuous(limits = c(0, 5000)) +
  scale_y_continuous(limits = c(0, 15)) +
  # fullrange = TRUE extends the fitted line across the whole x scale
  geom_smooth(method = "lm", formula = y ~ x, colour = "red", fullrange = TRUE) +
  #stat_poly_line(color="red") +
  #stat_poly_eq(aes(label = after_stat(eq.label)),size=10,label.x=0.9,label.y=0.1) +
  #stat_poly_eq(label.y = 0.05,size=10,label.x=0.76) +
  #  scale_color_discrete(labels=c("CTRL","100ng/ml","3ug/ml"))+
  scale_color_manual(
    values = c("#fdc6c6", "#fa8e8e", "#f73030", "#deaef3", "#ab8fe3", "#6f41cf")
  ) +
  labs(x = "APOBEC mediated SNVs", y = "# of [-C] in TCN context") +
  guides(
    color = guide_legend(override.aes = list(size = 10)),
    size = "none"
  ) +
  # Equation label computed on exactly the same subset as the plotted data.
  geom_text(
    x = 1000,
    y = 10,
    label = lm_eqn(
      SNV_indel_df2 %>%
        mutate(type = paste(APOBEC, dose, TP53, sep = "_")) %>%
        filter(APOBEC == "A3A", TP53 == "WT", !grepl("neg", indel_id))
    ),
    parse = TRUE
  )

dev.off()

# Export the table behind the regression, sorted for readability.
SNV_indel_df2 %>%
  mutate(type = paste(APOBEC, dose, TP53, sep = "_")) %>%
  arrange(APOBEC, TP53, dose, snv_id) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/indel_snv_regression.tsv",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )




# All A3A samples, points coloured by condition (`Sample`), with one overall
# linear fit and its 95% confidence band.
ggplot(
  SNV_indel_df2 %>%
    mutate(type = paste(APOBEC, dose, TP53, sep = "_")) %>%
    filter(APOBEC == "A3A"),
  aes(x = `2and13sum`, y = nTCN)
) +
  geom_point(aes(col = Sample, size = 5, stroke = 5)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 40),
    axis.text.y = element_text(size = 40),
    axis.title.y = element_text(size = 40),
    axis.title.x = element_text(size = 40),
    legend.text = element_text(size = 30),
    legend.title = element_text(size = 30),
    axis.ticks.length = unit(.5, "cm"),
    axis.ticks = element_line(colour = "black", size = 3)
  ) +
  scale_x_continuous(limits = c(0, 5000)) +
  scale_y_continuous(limits = c(0, 15)) +
  stat_smooth(method = lm, level = 0.95, colour = "black") +
  #  scale_color_discrete(labels=c("CTRL","100ng/ml","3ug/ml"))+
  scale_color_manual(
    values = c("#fdc6c6", "#fa8e8e", "#f73030", "#deaef3", "#ab8fe3", "#6f41cf")
  ) +
  labs(x = "APOBEC mediated SNVs", y = "# of [-C] in TCN context") +
  guides(
    color = guide_legend(override.aes = list(size = 10)),
    size = "none"
  )





# Quick look at the A3B groups: median count per context, one facet per group.
ggplot(
  filter(count_df, grepl("A3B", group)),
  aes(x = sig_cont, y = med_n)
) +
  geom_bar(stat = "identity") +
  facet_wrap(~group)

# ---- Per-type indel counts: Wilcoxon tests and boxplots ----
# Counts calls per sample/type/group, keeps types matching d1c1-5 or d1t1-5,
# and runs a Wilcoxon test of n ~ type within each group. This first
# `pval_df` is overwritten further down.
pval_df<-out_df%>%group_by(id,type,group)%>%dplyr:::summarise(n=n())%>%filter(grepl("d1c[1,2,3,4,5]",type) | grepl("d1t[1,2,3,4,5]",type))%>%ungroup()%>%
  rstatix::group_by(group) %>%
  rstatix::wilcox_test(formula=n ~ type)
#out_df$type<-factor(out_df$type,levels=)
# NOTE(review): earlier in this script `group` is built as
# paste(APOBEC,dose,TP53,sep="_"); none of those values equals "Ctrl",
# "100ng" or "3ug", so this factor() call would turn every group into NA.
# These levels look like leftovers from an older grouping scheme - confirm.
out_df$group<-factor(out_df$group,levels=c("Ctrl","100ng","3ug"))
# Wide table: one row per (group, id), one column per indel type, NA -> 0.
test_df1<-out_df%>%group_by(id,type,group)%>%dplyr:::summarise(num=n())%>%filter(grepl("d1c[1,2,3,4,5]",type) | grepl("d1t[1,2,3,4,5]",type))%>%ungroup()%>%select(group,num,type,id)%>%spread(type,num)
test_df1[is.na(test_df1)]<-0
test_df1$group
# Back to long format, then test each type against the "Ctrl" reference group.
pval_df<-test_df1%>%gather(type,num,d1c1:d1t5)%>%
  rstatix::group_by(type) %>%
  rstatix::wilcox_test(num ~ group,ref.group="Ctrl",p.adjust.method="bonferroni") %>%
  rstatix::add_xy_position()
# NOTE(review): `pavl_df` looks like a typo for `pval_df`; the result is never
# used below. rep(..., 9) also assumes exactly 18 rows in `pval_df`.
pavl_df<-pval_df%>%mutate(y.position=rep(c(13.4,16.3),9))
pval_df
# Boxplot of per-sample counts by group, one facet per indel type.
ggplot(test_df1%>%gather(type,num,d1c1:d1t5)%>%filter(grepl("d1c[1,2,3,4,5]",type) | grepl("d1t[1,2,3,4,5]",type)),aes(x=group,y=num))+
  geom_boxplot()+
  facet_wrap(~type,ncol=4)+
  #  add_pvalue(pval_df%>%mutate(y.position=rep(c(13.4,16.3),9)),label.size=5)+
  theme(axis.text=element_text(size=15),
        axis.title=element_text(size=20,face="bold"),
        strip.text = element_text(size=15),
        
  )+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))+
  scale_y_continuous(limits=c(0,20),
                     breaks=seq(0,20,by=5))+
  xlab("")+
  ylab("# of mutations")
#ylim(c(0,20))
#out_df%>%filter(grepl("d1c",type)|grepl("d1t",type))%>%filter(grepl("d1c",type))%>%group_by(group,type)%>%dplyr:::summarise(n=n())
#out_df
#out_df

# Spot checks for two individual indel types: printed counts, then boxplots.
out_df%>%group_by(id,type,group)%>%dplyr::summarise(n=n())%>%filter(type=="d1t6")
out_df%>%group_by(id,type,group)%>%dplyr::summarise(n=n())%>%filter(type=="i1t5")

ggplot(out_df%>%group_by(id,type,group)%>%dplyr::summarise(n=n())%>%filter(type=="d1t6"),aes(x=group,y=n))+
  geom_boxplot()

ggplot(out_df%>%group_by(id,type,group)%>%dplyr::summarise(n=n())%>%filter(type=="i1t5"),aes(x=group,y=n))+
  geom_boxplot()
# ---- Homopolymer run length on each side of the indel ----
# FOR_5bp: the 5 bases before the centre of the oriented 11-bp window, reversed
#          so that the base adjacent to the centre comes first.
# BAK_5bp: the 5 bases after the centre.
out_df <- mutate(
  out_df,
  FOR_5bp = stri_reverse(substr(ten_sig_cont, 1, 5)),
  BAK_5bp = substr(ten_sig_cont, 7, 11)
)
out_df %>% select(POS, REF, ALT, ten_bp_cont, ten_sig_cont, FOR_5bp, BAK_5bp)
out_df

# For every upstream flank: the base next to the centre and how many times it
# repeats consecutively from there (length of the leading run).
tmp <- lapply(out_df$FOR_5bp, function(flank) {
  first_base <- substr(flank, 1, 1)
  run_pattern <- paste0("^", first_base, "+")
  hit <- gregexpr(pattern = run_pattern, flank)[[1]]
  data.frame(
    "dir" = "5bp",
    "rep" = first_base,
    length = attr(hit, "match.length")
  )
})
tmp
#colnames(out_df)%>%length()
out_df1 <- cbind(out_df, do.call(rbind, tmp))
out_df1

# Same for the downstream flank.
tmp2 <- lapply(out_df$BAK_5bp, function(flank) {
  first_base <- substr(flank, 1, 1)
  run_pattern <- paste0("^", first_base, "+")
  hit <- gregexpr(pattern = run_pattern, flank)[[1]]
  data.frame(
    "dir" = "3bp",
    "rep" = first_base,
    length = attr(hit, "match.length")
  )
})
out_df2 <- cbind(out_df, do.call(rbind, tmp2))
# Long format: each variant now appears twice, once per direction.
out_df <- rbind(out_df1, out_df2)

# ---- Flanking T-run length per variant ----
# Sum the run lengths of both flanks whose adjacent base is "T", giving one
# T_n per variant and sample. `mut_info` (chrom, pos, id joined by tabs) is
# the join key used below.
T_out_df <- out_df %>%
  select(`#CHROM`, "POS", "REF", "ALT", "id", "dir", "rep", "length") %>%
  filter(rep == "T") %>%
  group_by(`#CHROM`, POS, id) %>%
  dplyr::summarise("T_n" = sum(length)) %>%
  mutate(mut_info = paste(`#CHROM`, POS, id, sep = "\t")) %>%
  ungroup() %>%
  select(mut_info, "T_n")

# One row per variant again: drop the per-direction columns and de-duplicate.
# The previous mutate(info = ...) was removed because the following select()
# discarded that column immediately.
uniq_out_df <- out_df %>%
  select(`#CHROM`, POS, REF, ALT, sig_cont, group, ten_sig_cont, ten_bp_cont,
         FOR_5bp, BAK_5bp, id, type) %>%
  unique()
uniq_out_df <- uniq_out_df %>% mutate(mut_info = paste(`#CHROM`, POS, id, sep = "\t"))
uniq_out_df
T_out_df
uniq_out_df <- left_join(uniq_out_df, T_out_df)
# Variants without a T on either side get a run length of zero.
uniq_out_df <- uniq_out_df %>% mutate(T_n = ifelse(is.na(T_n), 0, T_n))
uniq_out_df

# T-run length distribution of d1c1 deletions in TCA/TCT/TCG context.
ggplot(uniq_out_df %>% filter(sig_cont %in% c("TCA", "TCT", "TCG")) %>% filter(type == "d1c1"), aes(x = T_n)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = c(1:10)) +
  facet_wrap(~sig_cont)

# Same for d1c2. The `+` after geom_histogram() was missing, so the x scale
# was evaluated as a separate statement and never applied to the plot.
ggplot(uniq_out_df %>% filter(sig_cont %in% c("TCA", "TCT", "TCG")) %>% filter(type == "d1c2"), aes(x = T_n)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = c(1:10))



# Number of variants per indel type.
ggplot(uniq_out_df, aes(x = type)) +
  geom_histogram(stat = "count") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  #  facet_wrap(~type,scale="free_y")+
  scale_y_continuous(breaks = seq(0, 200, by = 10))

uniq_out_df %>% filter(grepl("d1t[1-5]", type))

# Per-sample number of d1t1-5 deletions by group.
ggplot(uniq_out_df %>% filter(grepl("d1t[1-5]", type)) %>% group_by(id, group) %>% dplyr::summarise(n = n()), aes(x = group, y = n)) +
  geom_boxplot()

# T-run length distribution of d1t1-5 deletions in TTN context, per group.
ggplot(uniq_out_df %>% filter(grepl("d1t[1-5]", type)) %>% filter(sig_cont %in% c("TTA", "TTT", "TTG", "TTC")), aes(x = T_n)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~group) +
  scale_x_continuous(breaks = c(1:10))


uniq_out_df

# ---- d1c1 counts per context, normalised by the number of samples ----
# Number of distinct samples in the control group. This line referenced the
# undefined object `nrowuniq_out_df` (a fused `nrow(uniq_out_df ...`), which
# stopped the script; it now mirrors the `tot_n` expression used below.
nrow(uniq_out_df %>% filter(group == "Ctrl") %>% select(id) %>% unique())

# NOTE(review): the group labels "Ctrl"/"100ng"/"3ug" must match the values
# actually present in `uniq_out_df$group` - confirm against the grouping used
# earlier in the script.
uniq_out_sig_df <- do.call(rbind, lapply(c("Ctrl", "100ng", "3ug"), function(x) {
  # d1c1 deletions of this group, tabulated over the 16 ordered contexts.
  A <- uniq_out_df %>% filter(type == "d1c1") %>% filter(group == x)
  contexttable16 <- table(A$sig_cont)[contextorder16]
  # Contexts absent from the data come back unnamed/NA: restore the names and
  # set their counts to zero.
  names(contexttable16) <- contextorder16
  contexttable16[is.na(contexttable16)] <- 0
  # as.data.frame() on a table yields the columns Var1 (context) and Freq.
  contexttable16 %>%
    as.data.frame() %>%
    mutate(group = x) %>%
    mutate(tot_n = nrow(uniq_out_df %>% filter(group == x) %>% select(id) %>% unique())) %>%
    mutate(normalized_n = Freq / tot_n)
}))
uniq_out_sig_df$group <- factor(uniq_out_sig_df$group, levels = c("Ctrl", "100ng", "3ug"))

# Normalised count per context, one facet per group.
ggplot(uniq_out_sig_df, aes(x = Var1, y = normalized_n, fill = group)) +
  geom_bar(stat = "identity") +
  facet_wrap(~group) +
  theme_bw() +
  theme(
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 25, face = "bold"),
    strip.text = element_text(size = 25),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  ) +
  xlab("") +
  ylab("normalized_count")



#uniq_out_df%>%view()
# ---- Base-graphics barplot of deletion counts per trinucleotide context ----
c("ACA", "ACC", "ACG", "ACT", "CCA", "CCC", "CCG", "CCT", "GCA", "GCC", "GCG", "GCT", "TCA", "TCC", "TCG", "TCT")
# Single-T deletions (types d1t1 .. d1t5).
uniq_out_df_d1t <- uniq_out_df %>% filter(grepl("d1t[1-5]", type))
contextorder16 <- c("ACA", "ACC", "ACG", "ACT", "CCA", "CCC", "CCG", "CCT", "GCA", "GCC", "GCG", "GCT", "TCA", "TCC", "TCG", "TCT")
####d1c1####
table(uniq_out_df$sig_cont)
# d1c1 counts over the 16 C-centred contexts; absent contexts are set to 0.
# This table is overwritten by the d1t table below before it is plotted.
contexttable16 <- table((uniq_out_df %>% filter(type == "d1c1"))$sig_cont)[contextorder16]
names(contexttable16) <- contextorder16
contexttable16[is.na(contexttable16)] <- 0
####d1t####
#table(uniq_out_df$sig_cont)
# From here on `contextorder16` holds the 16 T-centred contexts.
contextorder16 <- c("ATA", "ATC", "ATG", "ATT", "CTA", "CTC", "CTG", "CTT", "GTA", "GTC", "GTG", "GTT", "TTA", "TTC", "TTG", "TTT")

contexttable16 <- table(uniq_out_df_d1t$sig_cont)[contextorder16]
names(contexttable16) <- contextorder16
contexttable16[is.na(contexttable16)] <- 0


# `beside` takes a logical; the string "T" only worked through implicit
# coercion. barplot() returns the bar midpoints, printed below.
x <- barplot(contexttable16, las = 2,
             names.arg = contextorder16,
             cex.names = 0.5,
             beside = TRUE,
             ylim = range(pretty(c(0, max(contexttable16))))
             #          legend=mut16
)
x
# ---- d1c1 deletions in TCN vs non-TCN context ----
# Per sample: number of d1c1 deletions whose oriented context matches TCN and
# number that do not (printed for inspection).
uniq_out_df %>%
  filter(type == "d1c1") %>%
  group_by(id, group, type) %>%
  mutate(TCN = ifelse(grepl("TC[A,T,G,C]", sig_cont), 1, 0)) %>%
  mutate(nonTCN = ifelse(!grepl("TC[A,T,G,C]", sig_cont), 1, 0)) %>%
  dplyr::summarise(TCN = sum(TCN), nonTCN = sum(nonTCN))
# A stray `uniq_out_df$con` stood here; no column of that name is created in
# this script, so the line was removed.
# Same summary in long format: one row per sample and context class.
uniq_out_TCN_df <- uniq_out_df %>%
  filter(type == "d1c1") %>%
  group_by(id, group, type) %>%
  mutate(TCN = ifelse(grepl("TC[A,T,G,C]", sig_cont), 1, 0)) %>%
  mutate(nonTCN = ifelse(!grepl("TC[A,T,G,C]", sig_cont), 1, 0)) %>%
  dplyr::summarise(TCN = sum(TCN), nonTCN = sum(nonTCN)) %>%
  gather(context, n, TCN:nonTCN)
uniq_out_TCN_df$context <- factor(uniq_out_TCN_df$context, levels = c("TCN", "nonTCN"))

# Wilcoxon test TCN vs nonTCN within each group, Bonferroni-adjusted, with
# bracket positions for the dodged boxplot.
pval_uniq_out_TCN_df <- uniq_out_TCN_df %>%
  rstatix::group_by(group) %>%
  rstatix::wilcox_test(n ~ context) %>%
  rstatix::adjust_pvalue(p.col = "p", method = "bonferroni") %>%
  rstatix::add_significance(p.col = "p.adj") %>%
  rstatix::add_xy_position(x = "group", dodge = 0.8)
ggplot(uniq_out_TCN_df,
       aes(x = group, y = n)) +
  geom_boxplot(aes(fill = context)) +
  add_pvalue(pval_uniq_out_TCN_df,
             xmin = "xmin",
             xmax = "xmax",
             y.position = 11,
             label.size = 5) +
  theme(
    axis.text = element_text(size = 25),
    axis.title = element_text(size = 25, face = "bold"),
    strip.text = element_text(size = 25),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15)
  ) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  xlab("") +
  ylab("# of d1c1 type deletion")
# Which variants sit in a TC[A/G/T] context. The previous
# `sig_cont == regex("TC[A,G,T]")` compared each context with the literal
# pattern text and was therefore always FALSE.
grepl("TC[AGT]", uniq_out_df$sig_cont)
#uniq_out_df$type[stri_match(uniq_out_df$sig_cont,regex="TC[A,G,T]")]

# ---- SNV signature exposures per sample ----
# Collect every file in the SNV signature directory whose name matches
# "exposures.tsv".
snv_files_to_read<-list.files("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/A3A/09_clonalization/03_strelka/snv/sig",
                              "exposures.tsv",
                              full.names=T)
# Read each file; the sample id is the file name up to ".somatic", and the
# dose group is inferred from the id ("_3" -> 3ug, "_100" -> 100ng, otherwise
# Ctrl).
tmp2<-lapply(snv_files_to_read,function(x){
  read_tsv(x,col_types=cols(`#CHROM`="c"))%>%
    mutate(id=gsub(".somatic.*","",basename(x)))%>%
    mutate(group=ifelse(grepl("_3",id),"3ug",ifelse(grepl("_100",id),"100ng","Ctrl")))
}
)
tmp2

# Stack all samples and round the exposures to whole mutations.
merged_df2<-do.call(rbind,tmp2)
merged_df2$Exposure<-round(merged_df2$Exposure,0)
# Per-sample sum of the exposures of signatures "v3_2" and "v3_13".
merged_sum_df2<-merged_df2%>%filter(Signature %in% c("v3_2","v3_13"))%>%group_by(id,group)%>%dplyr:::summarise(`2and13sum`=sum(Exposure))

# log10 of the summed exposure by dose group.
ggplot(merged_sum_df2,aes(x=group,y=log10(`2and13sum`)))+
  geom_boxplot()+
  ylim(c(0,4))

merge_df
# NOTE(review): `d1c1_sum_df` is only assigned in commented-out lines near the
# top of this script, so the next line and the join below fail with "object
# not found" unless it was created interactively. The join keys also need
# checking: `group` here is Ctrl/100ng/3ug.
d1c1_sum_df
merged_sum_df2
# Natural join on all shared column names.
d1c1_snv_df<-left_join(d1c1_sum_df,merged_sum_df2)

#' Plotmath label for the linear fit of `n` on `2and13sum`.
#'
#' Replaces the earlier `lm_eqn` from this point of the script onwards.
#'
#' @param df Data frame with numeric columns `n` and `2and13sum`.
#' @return A single string for `geom_text(..., parse = TRUE)` containing the
#'   intercept and slope (2 significant digits), r^2 and the p-value of the
#'   slope (3 significant digits each).
lm_eqn <- function(df) {
  fit <- lm(`n` ~ `2and13sum`, df)
  fit_summary <- summary(fit)

  intercept <- format(coef(fit)[1], digits = 2)
  slope <- format(coef(fit)[2], digits = 2)
  r_squared <- format(fit_summary$r.squared, digits = 3)
  # column 4 of the coefficient table holds the p-values; row 2 is the slope
  slope_p <- format(fit_summary$coefficients[, 4][2], digits = 3)

  paste0(
    "italic(y)==", intercept, "+",
    slope, "%.%italic(x)*\",\"~~italic(r)^2==",
    r_squared,
    "%.%~~italic(p)==", slope_p
  )
}
# d1c1 deletions per sample against the summed signature exposure, with a
# linear fit and the lm_eqn() label placed at (3000, 2).
lm_eqn(d1c1_snv_df)
d1c1_snv_df
ggplot(d1c1_snv_df, aes(x = `2and13sum`, y = n)) +
  geom_point(size = 5) +
  geom_smooth(method = lm) +
  #  ylim(0,5000)+
  geom_text(
    x = 3000,
    y = 2,
    label = lm_eqn(d1c1_snv_df),
    parse = TRUE,
    size = 5
  ) +
  theme(
    # bare panel with black axis lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    # text sizes
    axis.text.x = element_text(size = 25),
    axis.text.y = element_text(size = 25),
    axis.title = element_text(size = 25)
  ) +
  labs(x = "# APOBEC mediated mutations", y = "# of d1c1 type deletion")


#####SNV+indel#####
###################

# Filtered SNV calls: files whose names end in "24.read_info_filtered.vcf".
snv_files_to_read <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/A3A/09_clonalization/03_strelka/snv",
  "24.read_info_filtered.vcf$",
  full.names = TRUE
)
# Printed only: the subset of paths without "TP53" (all files are read below).
snv_files_to_read[!grepl("TP53", snv_files_to_read)]

# Sample id = file name up to ".somatic"; dose group inferred from the id.
snv_tmp <- lapply(snv_files_to_read, function(vcf_path) {
  read_tsv(vcf_path, col_types = cols(`#CHROM` = "c")) %>%
    mutate(
      id = gsub(".somatic.*", "", basename(vcf_path)),
      group = ifelse(
        grepl("_3", id),
        "3ug",
        ifelse(grepl("_100", id), "100ng", "Ctrl")
      )
    )
})

snv_merge_df <- do.call(rbind, snv_tmp)
# Substitution label "REF>ALT": kept as-is when REF is C or G, complemented
# otherwise.
# NOTE(review): this maps A>N to T>N and T>N to A>N, so strands are swapped
# rather than collapsed - confirm whether c("C","T") was intended.
snv_merge_df <- mutate(
  snv_merge_df,
  type = ifelse(
    REF %in% c("C", "G"),
    paste0(REF, ">", ALT),
    paste0(chartr("ACGT", "TGCA", REF), ">", chartr("ACGT", "TGCA", ALT))
  )
)

# Indels (de-duplicated) stacked on top of the SNVs, sorted by sample and
# position.
snv_indel_df <- rbind(
  out_df %>% select(`#CHROM`, POS, REF, ALT, id, group, type) %>% unique(),
  snv_merge_df %>% select(`#CHROM`, POS, REF, ALT, id, group, type)
)
snv_indel_df <- arrange(snv_indel_df, id, `#CHROM`, POS)
snv_indel_df %>% filter(type == "C>T" | grepl("t", type))

# Distance to the previous variant on the same chromosome in the same sample.
snv_indel_df <- snv_indel_df %>%
  group_by(`#CHROM`, id) %>%
  mutate(Diff = POS - lag(POS))

# Variants closer than 10 bp to their predecessor: write out, then inspect.
write.table(
  snv_indel_df %>% arrange(Diff) %>% filter(Diff < 10),
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/A3A/09_clonalization/03_strelka/indel/snv_indel_df.txt",
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)
snv_indel_df %>%
  arrange(Diff) %>%
  filter(Diff < 10) %>%
  view()

snv_indel_df %>% filter(POS == "99776958")
ggplot(filter(snv_indel_df, Diff < 10), aes(x = Diff)) +
  geom_histogram()

# ---- Rebuild `out_df` from the unfiltered `merge_df` ----
# Same context extraction as earlier in the script; everything done to
# `out_df` in between is discarded here.
fasta_file <- FaFile(
  file = '/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa'
)

# 3-bp window: POS .. POS+2 on the "+" strand.
three_bp_df <- GRanges(
  seqnames = merge_df$`#CHROM`,
  IRanges(
    start = as.numeric(merge_df$POS),
    end = as.numeric(merge_df$POS) + 2
  ),
  strand = "+"
)
refbase <- as.data.frame(getSeq(fasta_file, three_bp_df))$x
df <- mutate(merge_df, three_bp_cont = refbase)
df

# Keep windows centred on C/T, reverse-complement the rest.
out_df <- mutate(
  df,
  sig_cont = ifelse(
    substr(three_bp_cont, 2, 2) %in% c("C", "T"),
    df$three_bp_cont,
    paste0(substr(stri_reverse(chartr("ATGC", "TACG", three_bp_cont)), 1, 3))
  )
)
out_df

# 11-bp window: POS-4 .. POS+6, oriented by the same rule.
ten_bp_df <- GRanges(
  seqnames = out_df$`#CHROM`,
  IRanges(
    start = as.numeric(out_df$POS - 4),
    end = as.numeric(out_df$POS) + 6
  ),
  strand = "+"
)
refbase <- as.data.frame(getSeq(fasta_file, ten_bp_df))$x
df <- mutate(out_df, ten_bp_cont = refbase)
out_df <- mutate(
  df,
  ten_sig_cont = ifelse(
    substr(three_bp_cont, 2, 2) %in% c("C", "T"),
    df$ten_bp_cont,
    stri_reverse(chartr("ATGC", "TACG", ten_bp_cont))
  )
)
# filter() without conditions keeps every row; this only prints the table.
filter(out_df)

# How many samples carry each of the d1c1-5 / d1t1-5 types.
out_df %>%
  group_by(id, type, group) %>%
  dplyr::summarise(n = n()) %>%
  filter(grepl("d1c[1,2,3,4,5]", type) | grepl("d1t[1,2,3,4,5]", type)) %>%
  select(type) %>%
  table()

# Per-sample counts by group, one facet per type.
ggplot(
  out_df %>%
    group_by(id, type, group) %>%
    dplyr::summarise(n = n()) %>%
    filter(grepl("d1c[1,2,3,4,5]", type) | grepl("d1t[1,2,3,4,5]", type)),
  aes(x = group, y = (n))
) +
  geom_boxplot() +
  facet_wrap(~type, ncol = 4)

# Total d1c* calls per group and type.
out_df %>%
  filter(grepl("d1c", type) | grepl("d1t", type)) %>%
  filter(grepl("d1c", type)) %>%
  group_by(group, type) %>%
  dplyr::summarise(n = n())
#out_df
#out_df


# ---- Homopolymer run length on each side of the indel (rebuilt out_df) ----
# FOR_5bp: the 5 bases before the centre of the oriented 11-bp window, reversed
#          so that the base adjacent to the centre comes first.
# BAK_5bp: the 5 bases after the centre.
out_df <- mutate(
  out_df,
  FOR_5bp = stri_reverse(substr(ten_sig_cont, 1, 5)),
  BAK_5bp = substr(ten_sig_cont, 7, 11)
)
out_df %>% select(POS, REF, ALT, ten_bp_cont, ten_sig_cont, FOR_5bp, BAK_5bp)
out_df

# Upstream flank: adjacent base and the length of its leading run.
tmp <- lapply(out_df$FOR_5bp, function(flank) {
  first_base <- substr(flank, 1, 1)
  run_pattern <- paste0("^", first_base, "+")
  hit <- gregexpr(pattern = run_pattern, flank)[[1]]
  data.frame(
    "dir" = "5bp",
    "rep" = first_base,
    length = attr(hit, "match.length")
  )
})
tmp
#colnames(out_df)%>%length()
out_df1 <- cbind(out_df, do.call(rbind, tmp))
out_df1

# Downstream flank, same computation.
tmp2 <- lapply(out_df$BAK_5bp, function(flank) {
  first_base <- substr(flank, 1, 1)
  run_pattern <- paste0("^", first_base, "+")
  hit <- gregexpr(pattern = run_pattern, flank)[[1]]
  data.frame(
    "dir" = "3bp",
    "rep" = first_base,
    length = attr(hit, "match.length")
  )
})
out_df2 <- cbind(out_df, do.call(rbind, tmp2))
# Long format: each variant appears once per direction.
out_df <- rbind(out_df1, out_df2)