library(dplyr)
library(tidyverse)
library(ggplot2)

# ---- Load inputs and build the per-mutation table (merge_df) ----

# Sample-level metadata table.
metadata <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/sig/metadata.txt"
)

# Annotated clustered-mutation calls for all samples.
cl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/v2/APOBEC_clustered_mutations/annotated/vaf_considered/APOBEC_clustered_mutations.all_samples.annotated.tsv"
)

# Per-sample call tables: files whose names match the regex ".excl.*gd$".
files_to_read <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter",
  ".excl.*gd$",
  full.names = TRUE
)

# Read every table, tag it with an id taken from the file name (everything
# from ".mutect2" onwards is removed) and keep only the columns used below.
vcf_tmp <- lapply(files_to_read, function(path) {
  calls <- read_tsv(path)
  calls <- mutate(calls, id = gsub(".mutect2.*", "", basename(path)))
  select(calls, id, `#CHROM`, POS, REF, ALT, sig_cont)
})
merge_df <- do.call(rbind, vcf_tmp)

# Keep rows whose sig_cont contains "<2 bases>" + ">" + "<2 bases>".
merge_df <- filter(merge_df, grepl("[ATGC][ATGC]>[ATGC][ATGC]", sig_cont))

# Attach the metadata (every column except `m/d`); the join uses whatever
# columns the two tables share.
merge_df <- merge_df %>%
  left_join(select(metadata, -`m/d`))

# Console preview of the cluster-annotation columns joined below.
cl_df %>%
  select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3)

# Attach the cluster annotation, matching on sample id, chromosome and position.
merge_df <- merge_df %>%
  left_join(
    cl_df %>%
      select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3) %>%
      plyr::rename(c("chr" = "#CHROM", "start" = "POS", "samples" = "id"))
  )

# Rows without a cluster annotation get IMD 0, cluster_id "." and
# cluster "non-clust"; the raw annotation column is then dropped.
merge_df <- merge_df %>%
  mutate(
    IMD = ifelse(is.na(IMD), 0, IMD),
    cluster_id = ifelse(is.na(cluster_id), ".", cluster_id),
    cluster = ifelse(
      is.na(cluster_type_omikli_upto_3),
      "non-clust",
      cluster_type_omikli_upto_3
    )
  ) %>%
  select(-cluster_type_omikli_upto_3)

# Restrict to A3A samples at the 3ug / 100ng doses.
A3A_cl_merge_df <- merge_df %>%
  filter(dose %in% c("3ug", "100ng"), APOBEC == "A3A")

A3A_cl_merge_df <- mutate(A3A_cl_merge_df, APOBEC = "A3A")

# AMS = number of mutations per cluster whose sig_cont matches "TC>[GT][AT]".
# Non-clustered rows get no AMS (NA after the join) and are therefore removed
# by the AMS >= 2 filter together with clusters that have fewer than two hits.
A3A_cl_merge_f_df <- A3A_cl_merge_df %>%
  left_join(
    A3A_cl_merge_df %>%
      group_by(id, `#CHROM`, APOBEC, dose, TP53, cluster_id, cluster) %>%
      dplyr::summarise(AMS = sum(grepl("TC>[GT][AT]", sig_cont))) %>%
      filter(cluster != "non-clust")
  ) %>%
  filter(AMS >= 2)

# no_snv = number of retained rows per (id, chromosome, cluster_id, cluster).
A3A_cl_merge_f_df <- A3A_cl_merge_f_df %>%
  left_join(
    A3A_cl_merge_f_df %>%
      group_by(id, `#CHROM`, cluster_id, cluster) %>%
      dplyr::summarise(no_snv = n())
  )


# ---- Density peaks of log10(IMD) per TP53 / dose / cluster group ----
# IMD is presumably the inter-mutation distance from the cluster annotation.

# Local maxima of the kernel density of log10(imd), returned on the log10 scale.
#   imd:    numeric vector of distances.
#   adjust: bandwidth multiplier forwarded to density().
# Non-finite log10 values are dropped, and numeric(0) is returned when fewer
# than two values remain, because density() cannot choose a bandwidth then.
# NOTE(review): which(diff(sign(diff(y))) == -2) gives the grid index just
# before each maximum; kept as in the original so reported peak values do not
# shift (the offset is one density grid step).
find_log10_density_peaks <- function(imd, adjust = 0.4) {
  log_imd <- log10(imd)
  log_imd <- log_imd[is.finite(log_imd)]
  if (length(log_imd) < 2) {
    return(numeric(0))
  }
  dens <- density(log_imd, adjust = adjust)
  dens$x[which(diff(sign(diff(dens$y, lag = 1))) == -2)]
}

# Flatten a named list of log10 peak positions into a tibble with one row per
# peak: `info` is the list name, `value` the peak back-transformed with 10^x
# and rounded. Names are repeated explicitly instead of stripping the numeric
# suffix that unlist() appends, which failed for groups with >= 10 peaks.
peaks_to_tibble <- function(peaks) {
  tibble(
    info = as.character(rep(names(peaks), lengths(peaks))),
    value = round(10**unlist(peaks, use.names = FALSE), 0)
  )
}

# One row per distinct (chromosome, cluster_id, cluster, TP53, dose, IMD) among
# the clustered mutations, excluding "dbs", "other" and "non-clust".
# NOTE: `t` shadows base::t(); the name is kept because later code uses it.
t <- A3A_cl_merge_df %>%
  filter(!cluster %in% c("dbs", "other", "non-clust")) %>%
  select(`#CHROM`, cluster_id, cluster, TP53, dose, IMD) %>%
  unique()

t <- t %>% mutate(info = paste0(TP53, "_", dose, "_", cluster))

# Console inspection. `t` has no `dist` column, so the original arrange(dist)
# failed; IMD is the distance column that exists here.
t %>% arrange(IMD)

t

peak_list <- lapply(unique(t$info), function(k) {
  print(k)
  find_log10_density_peaks((t %>% filter(info == k))$IMD, adjust = 0.4)
})

t$info

# Same table built from the AMS-filtered clusters.
ft <- A3A_cl_merge_f_df %>%
  filter(!cluster %in% c("dbs", "other", "non-clust")) %>%
  select(`#CHROM`, cluster_id, cluster, TP53, dose, IMD) %>%
  unique()

ft <- ft %>% mutate(info = paste0(TP53, "_", dose, "_", cluster))
ft

peak_f_list <- lapply(unique(ft$info), function(k) {
  print(k)
  find_log10_density_peaks((ft %>% filter(info == k))$IMD, adjust = 0.4)
})

names(peak_list) <- unique(t$info)
names(peak_f_list) <- unique(ft$info)
peak_f_list
peak_list

# Long tables of peaks, one row per (group, peak).
peak_df <- peaks_to_tibble(peak_list)
peak_f_df <- peaks_to_tibble(peak_f_list)
###dist_plot####

# Add the TP53_dose_cluster label used to select one group per plot.
A3A_cl_merge_df <- A3A_cl_merge_df %>%
  mutate(info = paste(TP53, dose, cluster, sep = "_"))
A3A_cl_merge_f_df <- A3A_cl_merge_f_df %>%
  mutate(info = paste(TP53, dose, cluster, sep = "_"))

# One PNG per group in t$info: density-scaled histogram plus kernel density of
# IMD on a log10 x axis, with a vertical line at each peak listed in peak_df.
for (i in unique(t$info)) {
  peak_values <- peak_df$value[peak_df$info == i]

  p <- A3A_cl_merge_df %>%
    filter(!cluster %in% c("dbs", "other", "non-clust")) %>%
    select(id, `#CHROM`, cluster_id, cluster, TP53, dose, IMD, info) %>%
    unique() %>%
    mutate(APOBEC = "A3A") %>%
    filter(info == i) %>%
    ggplot(aes(x = IMD, fill = cluster, col = APOBEC)) +
    geom_histogram(aes(y = ..density..)) +
    geom_density(adjust = 0.4, alpha = 0.5) +
    scale_color_manual(values = "black") +
    geom_vline(xintercept = peak_values) +
    # The lower limit must be positive on a log10 axis; c(1, 10000) matches the
    # other IMD plots in this script. The original passed `limit = c(0, 10000)`,
    # which relied on partial argument matching and maps 0 to -Inf.
    scale_x_log10(
      breaks = c(1, 10, 100, 1000, peak_values),
      labels = c(1, 10, 100, 1000, peak_values),
      limits = c(1, 10000)
    ) +
    theme(axis.text.x = element_text(size = 20, angle = 90)) +
    ggtitle(i)

  ggsave(
    paste0("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/IMD/", i, ".png"),
    p
  )
}


# Density peaks of log10(IMD) for the filtered clusters, pooled over TP53 and
# dose: one entry per cluster type.
peak_f_int_list <- lapply(c("kataegis", "omikli"), function(k) {
  print(k)
  log_imd <- log10((ft %>% filter(cluster == k))$IMD)
  log_imd <- log_imd[is.finite(log_imd)]
  # density() needs at least two points to choose a bandwidth.
  if (length(log_imd) < 2) {
    return(numeric(0))
  }
  dens <- density(log_imd, adjust = 0.4)
  # Grid positions where the slope of the density changes from + to -.
  dens$x[which(diff(sign(diff(dens$y, lag = 1))) == -2)]
})

names(peak_f_int_list) <- c("kataegis", "omikli")

# One row per peak. The cluster name is repeated explicitly rather than
# recovered by stripping the index suffix that unlist() appends, because the
# single-digit strip ("[0-9]$") breaks once a cluster type has >= 10 peaks.
peak_f_int_df <- tibble(
  info = as.character(rep(names(peak_f_int_list), lengths(peak_f_int_list))),
  value = round(10**unlist(peak_f_int_list, use.names = FALSE), 0)
)

# One PNG per TP53_dose_cluster label in ft$info, drawn from the AMS-filtered
# clusters: density-scaled histogram and kernel density of IMD on a log10 axis
# limited to 1..10000, with vertical lines at the peaks stored in peak_f_df.
for (i in unique(ft$info)) {
  p <- A3A_cl_merge_f_df %>%
    filter(!cluster %in% c("dbs", "other", "non-clust")) %>%
    select(id, `#CHROM`, cluster_id, cluster, TP53, dose, IMD, info) %>%
    unique() %>%
    mutate(APOBEC = "A3A") %>%
    filter(info == i) %>%
    ggplot(aes(x = IMD, fill = cluster, col = APOBEC)) +
    geom_histogram(aes(y = ..density..)) +
    geom_density(adjust = 0.4, alpha = 0.5) +
    scale_color_manual(values = "black") +
    geom_vline(xintercept = filter(peak_f_df, info == i)$value) +
    scale_x_log10(
      breaks = c(1, 10, 100, 1000, 10000, filter(peak_f_df, info == i)$value),
      labels = c(1, 10, 100, 1000, 10000, filter(peak_f_df, info == i)$value),
      limits = c(1, 10000)
    ) +
    theme(axis.text.x = element_text(size = 20, angle = 90, hjust = 1, vjust = 0.5)) +
    ggtitle(i)

  ggsave(
    paste0("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/IMD/", i, ".f.png"),
    p
  )
}

# One PNG per cluster type, pooling every TP53/dose group of the AMS-filtered
# clusters; vertical lines mark the pooled peaks stored in peak_f_int_df.
for (i in c("kataegis", "omikli")) {
  p <- A3A_cl_merge_f_df %>%
    filter(cluster %in% i) %>%
    select(id, `#CHROM`, cluster_id, cluster, TP53, dose, IMD, info) %>%
    unique() %>%
    mutate(APOBEC = "A3A") %>%
    ggplot(aes(x = IMD, fill = cluster, col = APOBEC)) +
    geom_histogram(aes(y = ..density..)) +
    geom_density(adjust = 0.4, alpha = 0.5) +
    scale_color_manual(values = "black") +
    geom_vline(xintercept = filter(peak_f_int_df, info == i)$value) +
    scale_x_log10(
      breaks = c(1, 10, 100, 1000, 10000, filter(peak_f_int_df, info == i)$value),
      labels = c(1, 10, 100, 1000, 10000, filter(peak_f_int_df, info == i)$value),
      limits = c(1, 10000)
    ) +
    theme(axis.text.x = element_text(size = 20, angle = 90, hjust = 1, vjust = 0.5)) +
    ggtitle(i)

  ggsave(
    paste0("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/IMD/", i, ".f.png"),
    p
  )
}


# IMD histograms for kataegis clusters split by size: "h_kataegis" when
# no_snv > 9, otherwise "l_kataegis".
#
# The original loop overwrote `i` with hard-coded values on every iteration and
# never assigned the plot to `p`, so ggsave() wrote the stale plot left over
# from the previous loop to "l_kataegis.f.png" twice. The loop now iterates
# over the two size classes and saves the plot it has just built.
# "omikli" is not iterated here: it would overwrite the "omikli.f.png" written
# by the per-cluster-type loop, which carries the peak annotations.
# NOTE(review): this split uses no_snv > 9 while the export code later in the
# script filters on no_snv >= 9 -- confirm which threshold is intended.
for (i in c("h_kataegis", "l_kataegis")) {
  p <- A3A_cl_merge_f_df %>%
    filter(cluster %in% c("kataegis", "omikli")) %>%
    mutate(
      cl_type = ifelse(
        cluster == "omikli",
        "omikli",
        ifelse(no_snv > 9, "h_kataegis", "l_kataegis")
      )
    ) %>%
    filter(cl_type %in% i) %>%
    select(id, `#CHROM`, cluster_id, cluster, TP53, dose, IMD) %>%
    unique() %>%
    mutate(APOBEC = "A3A") %>%
    ggplot(aes(x = IMD, fill = cluster, col = APOBEC)) +
    geom_histogram(aes(y = ..density..)) +
    geom_density(adjust = 0.4, alpha = 0.5) +
    scale_color_manual(values = "black") +
    scale_x_log10(
      breaks = c(1, 10, 100, 1000, 10000),
      labels = c(1, 10, 100, 1000, 10000),
      limits = c(1, 10000)
    ) +
    theme(axis.text.x = element_text(size = 20, angle = 90, hjust = 1, vjust = 0.5)) +
    ggtitle(i)

  ggsave(
    paste0("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/IMD/", i, ".f.png"),
    p
  )
}



# ---- Exploratory: Gaussian mixture fit and density peaks of kataegis IMD ----

# flexmix must be attached before FLXMRglm() / flexmix() are called; the
# original loaded it only after its first use.
library(flexmix)

# First trial fit on the KO / 3ug / kataegis group (superseded by the fit below).
D <- (t %>% filter(info == "KO_3ug_kataegis"))$IMD
m1 <- FLXMRglm(family = "gaussian")
fit <- flexmix(
  D ~ 1,
  data = as.data.frame(D),
  k = 7,
  model = list(m1, m1, m1, m1, m1, m1, m1)
)

# Density peaks for WT / 3ug / kataegis on the raw IMD scale. `t` has no `dist`
# column (the original `$dist` yielded NULL and density() failed); IMD is the
# distance column present in `t`.
wt_3ug_kataegis_density <- density(
  (t %>% filter(dose == "3ug" & TP53 == "WT" & cluster == "kataegis"))$IMD,
  adjust = 0.5
)
DensityFaithfulX <- wt_3ug_kataegis_density$x
DensityFaithfulY <- wt_3ug_kataegis_density$y
DensityFaithfulX[which(diff(sign(diff(DensityFaithfulY, lag = 1))) == -2)]

# Scratch: behaviour of diff() for different lag / differences settings.
x <- cumsum(cumsum(1:10))
diff(x, lag = 1)
diff(x, lag = 2)
diff(x, lag = 5)
diff(x, differences = 2)

# Mixture fit on the WT / 3ug / kataegis IMD values.
D <- (t %>% filter(dose == "3ug" & TP53 == "WT" & cluster == "kataegis"))$IMD
kde <- density(D)
m1 <- FLXMRglm(family = "gaussian")
m2 <- FLXMRglm(family = "gaussian")
fit <- flexmix(D ~ 1, data = as.data.frame(D), k = 7, model = list(m1, m1, m1))

# The fit can end with fewer components than the k requested, so iterate over
# the number actually fitted (slot `k`) instead of a hard-coded 1:7.
for (i in seq_len(fit@k)) {
  print(parameters(fit, component = i)[[1]])
}

# The three component summaries below need at least three fitted components.
stopifnot(fit@k >= 3)
c1 <- parameters(fit, component = 1)[[1]]
c2 <- parameters(fit, component = 2)[[1]]
c3 <- parameters(fit, component = 3)[[1]]

# abline() only draws onto an existing plot, so plot the kernel density first.
plot(kde)
abline(v = 1, col = 'blue')
abline(v = c1[[1]], lty = 2, col = 'blue')
abline(v = 5, col = 'red')
abline(v = c2[[1]], lty = 2, col = 'red')
abline(v = c3[[1]], lty = 2, col = 'green')

sign(diff(DensityFaithfulY))



##sv_correlation##

# Span of every kataegis/omikli cluster: smallest and largest `start` among
# its rows in cl_df. The result is ungrouped explicitly: summarise() otherwise
# leaves it grouped by samples / cluster type / cluster_id, and cluster_id is
# re-typed with mutate() further down, which older dplyr versions refuse to do
# on a grouping column.
cl_br_df <- cl_df %>%
  filter(cluster_type_omikli_upto_3 %in% c("kataegis", "omikli")) %>%
  group_by(samples, cluster_type_omikli_upto_3, cluster_id, chr) %>%
  dplyr::summarise(min_br = min(start), max_br = max(start)) %>%
  ungroup()

cl_br_df <- cl_br_df %>%
  plyr::rename(c(
    "chr" = "#CHROM",
    "samples" = "id",
    "cluster_type_omikli_upto_3" = "cluster"
  ))

# Per-sample delly call tables whose path contains "A3A".
# NOTE(review): the table assembled from these files is replaced a few lines
# below by the pre-merged sv_clonal.tsv; this read is kept only so that
# `delly_files` and `sv_tmp` still exist for any code outside this view.
delly_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/26_delly_clonal/original",
  "*Fr.clonal.vcf",
  full.names = TRUE
)
delly_files <- delly_files[grepl("A3A", delly_files)]

sv_tmp <- lapply(delly_files, function(x) {
  read_tsv(x) %>% mutate(id = gsub(".delly.*", "", basename(x)))
})

sv_merge_df <- do.call(rbind, sv_tmp)

sv_merge_df <- sv_merge_df %>% select(id, `#CHR1`, POS1, CHR2, POS2, svtype)

# Pre-merged SV table; this is the sv_merge_df actually used from here on.
sv_merge_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/26_delly_clonal/original/sv_clonal.tsv"
) %>%
  select(id, `#CHR1`, POS1, CHR2, POS2, svtype)

# Stack both ends of every SV so that each row is a single breakpoint.
sv_tmp_df1 <- sv_merge_df %>% select(id, svtype, `#CHR1`, POS1)
sv_tmp_df2 <- sv_merge_df %>% select(id, svtype, CHR2, POS2)

colnames(sv_tmp_df1) <- c("samples", "sv_type", "chr", "br")
colnames(sv_tmp_df2) <- c("samples", "sv_type", "chr", "br")

sv_df <- rbind(sv_tmp_df1, sv_tmp_df2)

sv_df$br <- as.double(sv_df$br)

# Rewrite "_3-" as "_3ug-" in the sample names
# (presumably to match the ids of the mutation table -- verify).
sv_df <- sv_df %>% mutate(samples = gsub("_3-", "_3ug-", samples))

sv_df <- sv_df %>% plyr::rename(c("samples" = "id", "chr" = "#CHROM"))
# Attach each cluster's span (min_br / max_br) to the filtered mutation table.
# cl_br_df is ungrouped first so that cluster_id is not a grouping column when
# it is converted to character.
A3A_cl_merge_f_br_df <- left_join(
  A3A_cl_merge_f_df,
  cl_br_df %>%
    ungroup() %>%
    mutate(cluster_id = as.character(cluster_id))
)

write.table(
  A3A_cl_merge_f_br_df %>% unique(),
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/cl_info.v3.txt",
  sep = "\t", quote = F, row.names = F
)

# Pair every mutation with every SV breakpoint that shares its join columns
# and compute absolute distances from the breakpoint to the cluster start
# (l_dist), the cluster end (r_dist) and the mutation itself (dist).
# sv_df already carries the `id` / `#CHROM` column names at this point, so the
# second plyr::rename() of the original (which only produced a
# "values were not present" message) is dropped.
A3A_cl_merge_f_br_sv_df <- left_join(A3A_cl_merge_f_br_df, sv_df) %>%
  mutate(l_dist = ifelse(is.na(br), NA, abs(br - min_br))) %>%
  mutate(r_dist = ifelse(is.na(br), NA, abs(max_br - br))) %>%
  mutate(dist = ifelse(is.na(br), NA, abs(POS - br)))

# Clusters containing a mutation less than 1000 away from an SV breakpoint.
A3A_excl_info <- A3A_cl_merge_f_br_sv_df %>%
  filter(dist < 1000) %>%
  select(id, `#CHROM`, cluster_id, AMS, no_snv, sv_type, br, l_dist, r_dist) %>%
  unique() %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))

write.table(
  A3A_cl_merge_f_br_sv_df,
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/v2/A3A_cl_sv_df.v3.txt",
  sep = "\t", quote = F, row.names = F
)

# One cluster table per sample. The bare expressions that stood inside the
# original loop had no effect (values are not auto-printed inside `for`).
for (ids in unique(A3A_cl_merge_f_br_df$id)) {
  print(ids)
  write.table(
    A3A_cl_merge_f_br_df %>% filter(id == ids) %>% unique(),
    paste0("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/v3/", ids, ".cl_info.txt"),
    sep = "\t", quote = F, row.names = F
  )
}

# Console overview of the clusters with at least 9 mutations.
A3A_cl_merge_f_br_df %>%
  filter(no_snv >= 9) %>%
  select(id, `#CHROM`, cluster_id, cluster, AMS, no_snv, min_br, max_br) %>%
  unique()
#A3A_cl_merge_f_br_df%.%

# ---- Merge the per-sample phasing tables ----

# Files in the clustered-mutation directory whose names match "info.phasing.txt".
phs_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation",
  "info.phasing.txt",
  full.names = T
)

# Read each table; the id is the file name with everything from ".cl" removed.
phs_tmp <- lapply(phs_files, function(path) {
  phasing <- read_tsv(path)
  mutate(phasing, id = gsub(".cl.*", "", basename(path)))
})

phs_merge_df <- do.call(rbind, phs_tmp)

A3A_cl_merge_f_br_phs_df <- A3A_cl_merge_f_br_df %>%
  left_join(phs_merge_df)

# phs_count = number of rows per (id, chromosome, cluster_id, phsBF).
A3A_cl_merge_f_br_phs_df <- A3A_cl_merge_f_br_phs_df %>%
  left_join(
    A3A_cl_merge_f_br_phs_df %>%
      group_by(id, `#CHROM`, cluster_id, phsBF) %>%
      dplyr::summarise(phs_count = n())
  )

# no_info_count = phs_count of the phsBF == "2" rows of a cluster (0 when the
# cluster has none); no_snv_phs = no_snv - 1 - no_info_count;
# phs_ratio = phs_count / no_snv_phs.
A3A_cl_merge_f_br_phs_df <- A3A_cl_merge_f_br_phs_df %>%
  left_join(
    A3A_cl_merge_f_br_phs_df %>%
      filter(phsBF == "2") %>%
      select(id, `#CHROM`, cluster_id, phs_count) %>%
      unique() %>%
      plyr::rename(c("phs_count" = "no_info_count"))
  ) %>%
  mutate(
    no_info_count = ifelse(is.na(no_info_count), 0, no_info_count),
    no_snv_phs = no_snv - 1 - no_info_count,
    phs_ratio = phs_count / (no_snv_phs)
  )

# Turn no_snv_phs into a factor whose levels follow ascending numeric order.
A3A_cl_merge_f_br_phs_df$no_snv_phs <- factor(
  A3A_cl_merge_f_br_phs_df$no_snv_phs,
  levels = unique(arrange(A3A_cl_merge_f_br_phs_df, no_snv_phs)$no_snv_phs)
)





##context count###
library(GenomicRanges)
library(Rsamtools)

# One row per cluster (clusters of type "other" excluded) with its span.
range_df <- A3A_cl_merge_f_br_phs_df %>%
  filter(cluster != "other") %>%
  select(id, `#CHROM`, cluster_id, min_br, max_br) %>%
  unique()

fasta_file <- FaFile(
  file = '/home/users/ayh/Projects/reference/genome/human/GRCh37/A3B/human_g1k_v37.rtTA.A3B_mcherry_vec.fa'
)

# The cluster span widened by one base on each side, requested once on the
# "+" strand and once on the "-" strand.
p_df <- GRanges(
  seqnames = range_df$`#CHROM`,
  IRanges(start = range_df$min_br - 1, end = range_df$max_br + 1),
  strand = "+"
)
n_df <- GRanges(
  seqnames = range_df$`#CHROM`,
  IRanges(start = range_df$min_br - 1, end = range_df$max_br + 1),
  strand = "-"
)

# Reference sequence of every window as a character vector.
p_refbase <- as.data.frame(getSeq(fasta_file, p_df))$x
n_refbase <- as.data.frame(getSeq(fasta_file, n_df))$x

# Number of "TC[ACGT]" occurrences per window on each strand; the look-ahead
# makes overlapping occurrences count separately.
range_df <- cbind(
  range_df,
  pos_count = str_count(p_refbase, paste0("(?=", "TC[ACGT]", ")")),
  neg_count = str_count(n_refbase, paste0("(?=", "TC[ACGT]", ")"))
)

A3A_cl_merge_f_br_phs_back_df <- A3A_cl_merge_f_br_phs_df %>%
  filter(cluster != "other") %>%
  left_join(range_df)

# TCN_count = mutations per cluster whose sig_cont contains "TC>";
# nonTCN_count = the remaining mutations of that cluster.
A3A_cl_merge_f_br_phs_back_df <- A3A_cl_merge_f_br_phs_back_df %>%
  left_join(
    A3A_cl_merge_f_br_phs_back_df %>%
      dplyr::group_by(id, `#CHROM`, cluster_id) %>%
      dplyr::summarise(TCN_count = sum(grepl("TC>", sig_cont)))
  ) %>%
  mutate(nonTCN_count = no_snv - TCN_count)

# TCN_count relative to the motif count of the strand picked from REF:
# pos_count when REF is C or T, neg_count otherwise.
A3A_cl_merge_f_br_phs_back_df <- A3A_cl_merge_f_br_phs_back_df %>%
  mutate(
    TCN_ratio = ifelse(
      REF %in% c("C", "T"),
      TCN_count / pos_count,
      TCN_count / neg_count
    )
  )

# Column subset kept for inspection.
t_df <- A3A_cl_merge_f_br_phs_back_df %>%
  select(
    id, `#CHROM`, REF, ALT, sig_cont, cluster_id, cluster, no_snv,
    pos_count, neg_count, TCN_count, nonTCN_count, TCN_ratio
  )


# ---- Per-cluster strand counts and TCN / non-TCN ratios ----

# Mutations per cluster by REF base: C/T counted as "pos", G/A as "neg".
A3A_cl_merge_f_br_phs_back_df <- left_join(
  A3A_cl_merge_f_br_phs_back_df,
  A3A_cl_merge_f_br_phs_back_df %>%
    group_by(id, `#CHROM`, cluster_id) %>%
    dplyr::summarise(
      mut_pos_count = sum(REF %in% c("C", "T")),
      mut_neg_count = sum(REF %in% c("G", "A"))
    )
)

A3A_cl_merge_f_br_phs_back_df <- A3A_cl_merge_f_br_phs_back_df %>%
  mutate(mut_tot_count = mut_pos_count + mut_neg_count) %>%
  mutate(mut_pos_ratio = mut_pos_count / mut_tot_count) %>%
  mutate(mut_neg_ratio = mut_neg_count / mut_tot_count)

# Counts per cluster split by context pattern and REF-base strand.
# NOTE(review): the "non-TCN" pattern "[AG]C>" does not match a "CC>" context,
# so such mutations fall into neither the TCN nor the non-TCN counts --
# confirm this is intended.
A3A_cl_merge_f_br_phs_back_df <- left_join(
  A3A_cl_merge_f_br_phs_back_df %>% ungroup(),
  A3A_cl_merge_f_br_phs_back_df %>%
    dplyr::group_by(id, `#CHROM`, cluster_id, TCN_count) %>%
    dplyr::summarise(
      TCN_pos_count = sum(grepl("TC>[AGT][ACGT]", sig_cont) & REF %in% c("C", "T")),
      TCN_neg_count = sum(grepl("TC>[AGT][ACGT]", sig_cont) & REF %in% c("G", "A")),
      nonTCN_pos_count = sum(grepl("[AG]C>[AGT][ACGT]", sig_cont) & REF %in% c("C", "T")),
      nonTCN_neg_count = sum(grepl("[AG]C>[AGT][ACGT]", sig_cont) & REF %in% c("G", "A")),
      nonC_count = sum(grepl("T>", sig_cont))
    )
)

# Export the table with a TCN / nonTCN class label per mutation.
A3A_cl_merge_f_br_phs_back_df %>%
  mutate(class = ifelse(grepl("TC>[AGT][ACGT]", sig_cont), "TCN", "nonTCN")) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/06_Figure_code/Fig3/cl_mutation_table.txt",
    sep = "\t",
    quote = F,
    row.names = F
  )

A3A_cl_merge_f_br_phs_back_df$no_snv <- as.double(A3A_cl_merge_f_br_phs_back_df$no_snv)

# Each count as a fraction of the cluster size.
A3A_cl_merge_f_br_phs_back_df <- A3A_cl_merge_f_br_phs_back_df %>%
  mutate(
    TCN_mut_pos_ratio = TCN_pos_count / no_snv,
    TCN_mut_neg_ratio = TCN_neg_count / no_snv,
    nonTCN_pos_ratio = nonTCN_pos_count / no_snv,
    nonTCN_neg_ratio = nonTCN_neg_count / no_snv,
    nonC_ratio = nonC_count / no_snv
  )

# no_snv as a factor with levels in ascending numeric order.
A3A_cl_merge_f_br_phs_back_df$no_snv <- factor(
  A3A_cl_merge_f_br_phs_back_df$no_snv,
  levels = (A3A_cl_merge_f_br_phs_back_df %>% arrange(no_snv))$no_snv %>% unique()
)

# Back to numeric. as.double() applied directly to a factor returns the
# internal level codes (1, 2, 3, ...) rather than the original counts, so the
# labels are converted through character first.
A3A_cl_merge_f_br_phs_back_df$no_snv <- as.double(
  as.character(A3A_cl_merge_f_br_phs_back_df$no_snv)
)

# Strand carrying more TCN mutations ("." when tied); TCN_ratio is then
# recomputed against the motif count of that strand (NA when tied).
A3A_cl_merge_f_br_phs_back_df <- A3A_cl_merge_f_br_phs_back_df %>%
  mutate(
    main_strand = ifelse(
      TCN_pos_count > TCN_neg_count,
      "pos",
      ifelse(TCN_pos_count < TCN_neg_count, "neg", ".")
    )
  )
A3A_cl_merge_f_br_phs_back_df <- A3A_cl_merge_f_br_phs_back_df %>%
  mutate(
    TCN_ratio = ifelse(
      main_strand == "pos",
      TCN_pos_count / pos_count,
      ifelse(main_strand == "neg", TCN_neg_count / neg_count, NA)
    )
  )


##merge_all##

# Per-sample tables in the clustered-mutation directory, selected by the
# regular expressions "seg.txt" and "dist.txt".
seg_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation",
  "seg.txt",
  full.names = T
)
dist_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation",
  "dist.txt",
  full.names = T
)

# Read every table; the id is the file name with everything from ".cl_info"
# removed.
seg_tmp <- lapply(seg_files, function(path) {
  mutate(read_tsv(path), id = gsub(".cl_info.*", "", basename(path)))
})

dist_tmp <- lapply(dist_files, function(path) {
  mutate(read_tsv(path), id = gsub(".cl_info.*", "", basename(path)))
})

seg_df <- do.call(rbind, seg_tmp)
dist_df <- do.call(rbind, dist_tmp)

# cluster_id is joined as character.
seg_df$cluster_id <- as.character(seg_df$cluster_id)
dist_df$cluster_id <- as.character(dist_df$cluster_id)

A3A_cl_merge_f_br_phs_back_dist_seg_df <- left_join(
  left_join(A3A_cl_merge_f_br_phs_back_df, seg_df),
  dist_df
)

A3A_cl_merge_f_br_phs_back_dist_seg_df$basestat <- factor(
  A3A_cl_merge_f_br_phs_back_dist_seg_df$basestat,
  levels = c(0, 1)
)

# Group label used by the peak detection and plots: dose_cluster_TP53_basestat.
A3A_cl_merge_f_br_phs_back_dist_seg_df <- mutate(
  A3A_cl_merge_f_br_phs_back_dist_seg_df,
  info = paste0(dose, "_", cluster, "_", TP53, "_", basestat)
)


# ---- Density peaks of the per-cluster mean `dist`, per info group ----

# Mean of the non-zero `dist` values per cluster, computed once instead of
# twice for every group as in the original.
# NOTE(review): the column is called med_dist but holds mean(dist) -- confirm
# whether a median was intended.
base_cluster_dist_df <- A3A_cl_merge_f_br_phs_back_dist_seg_df %>%
  filter(dist != 0) %>%
  group_by(id, APOBEC, TP53, dose, cluster, `#CHROM`, cluster_id, basestat) %>%
  dplyr::summarise(med_dist = mean(dist)) %>%
  ungroup() %>%
  mutate(info = paste0(dose, "_", cluster, "_", TP53, "_", basestat))

base_peak_list <- lapply(
  unique(A3A_cl_merge_f_br_phs_back_dist_seg_df$info),
  function(k) {
    print(k)
    log_dist <- log10(base_cluster_dist_df$med_dist[base_cluster_dist_df$info == k])
    log_dist <- log_dist[is.finite(log_dist)]
    # A group can lose all its rows to the dist != 0 filter, and density()
    # needs at least two points to choose a bandwidth.
    if (length(log_dist) < 2) {
      return(numeric(0))
    }
    dens <- density(log_dist, adjust = 0.4)
    # Grid positions where the slope of the density changes from + to -.
    dens$x[which(diff(sign(diff(dens$y, lag = 1))) == -2)]
  }
)

names(base_peak_list) <- unique(A3A_cl_merge_f_br_phs_back_dist_seg_df$info)

# One row per peak. The group label is repeated explicitly: these labels end
# in the basestat digit, so the original gsub("[0-9]$", "", ...) removed that
# digit whenever a group had exactly one peak (unlist() adds no index suffix in
# that case) and the peak could never be matched back to its group.
base_peak_df <- tibble(
  info = as.character(rep(names(base_peak_list), lengths(base_peak_list))),
  value = round(10**unlist(base_peak_list, use.names = FALSE), 0)
)



# One PNG per dose_cluster_TP53_basestat label: density-scaled histogram and
# kernel density of the per-cluster mean non-zero `dist` on a log10 axis
# limited to 1..10000, with vertical lines at the peaks stored in base_peak_df.
for (i in unique(A3A_cl_merge_f_br_phs_back_dist_seg_df$info)) {
  p <- A3A_cl_merge_f_br_phs_back_dist_seg_df %>%
    filter(dist != 0) %>%
    group_by(id, APOBEC, TP53, dose, cluster, `#CHROM`, cluster_id, basestat) %>%
    dplyr::summarise(med_dist = mean(dist)) %>%
    mutate(info = paste0(dose, "_", cluster, "_", TP53, "_", basestat)) %>%
    ungroup() %>%
    filter(info == i) %>%
    ggplot(aes(x = med_dist)) +
    geom_histogram(aes(y = ..density..)) +
    geom_density(adjust = 0.4, alpha = 0.5) +
    scale_color_manual(values = "black") +
    geom_vline(xintercept = filter(base_peak_df, info == i)$value) +
    scale_x_log10(
      breaks = c(1, 10, 100, 1000, 10000, filter(base_peak_df, info == i)$value),
      labels = c(1, 10, 100, 1000, 10000, filter(base_peak_df, info == i)$value),
      limits = c(1, 10000)
    ) +
    theme(axis.text.x = element_text(size = 20, angle = 90, hjust = 1, vjust = 0.5)) +
    ggtitle(i)

  ggsave(
    paste0("/home/users/ayh/Projects/27_A3B/06_Figure_code/edit_figure/Fig3/IMD/", i, ".base.png"),
    p
  )
}



#####C>T/G ratio

# Count mutations per cluster and substitution type (characters 2-4 of
# sig_cont), attach the cluster-level annotation, and spread the counts into
# one column per substitution type.
A3A_cl_merge_f_ratio_df <- A3A_cl_merge_f_df %>%
  mutate(mut_type = substr(sig_cont, 2, 4)) %>%
  group_by(id, `#CHROM`, cluster_id, cluster, mut_type) %>%
  dplyr::summarise(type_count = n()) %>%
  left_join(
    A3A_cl_merge_f_df %>%
      select(id, `#CHROM`, cluster_id, cluster, APOBEC, TP53, dose, no_snv) %>%
      unique()
  ) %>%
  spread(mut_type, type_count)

# Substitution types absent from a cluster count as 0.
A3A_cl_merge_f_ratio_df[is.na(A3A_cl_merge_f_ratio_df)] <- 0

# Back to long format over the columns `C>A` .. `T>G`, then express each count
# as a fraction of the cluster size.
A3A_cl_merge_f_ratio_df <- A3A_cl_merge_f_ratio_df %>%
  gather(mut_type, count, `C>A`:`T>G`) %>%
  mutate(ratio = count / no_snv)

# no_snv as a factor with levels in ascending numeric order.
A3A_cl_merge_f_ratio_df$no_snv <- factor(
  A3A_cl_merge_f_ratio_df$no_snv,
  levels = unique(arrange(A3A_cl_merge_f_ratio_df, no_snv)$no_snv)
)



# Non-clustered mutations whose sig_cont contains "TC>": count per sample and
# substitution type (n), joined with the per-sample total (tot_n).
A3A_noncl_ratio_df <- A3A_cl_merge_df %>%
  filter(cluster == "non-clust") %>%
  filter(grepl("TC>", sig_cont)) %>%
  mutate(mut_type = substr(sig_cont, 2, 4)) %>%
  group_by(id, dose, TP53, mut_type) %>%
  dplyr::summarise(n = n()) %>%
  left_join(
    A3A_cl_merge_df %>%
      filter(cluster == "non-clust") %>%
      filter(grepl("TC>", sig_cont)) %>%
      mutate(mut_type = substr(sig_cont, 2, 4)) %>%
      group_by(id, dose, TP53) %>%
      dplyr::summarise(tot_n = n())
  )

###region###

# Annotation tables: files in annotation_total whose names match "bed$".
files_to_read <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/28_genomic_feature/05_vcf/annotation_total",
  "bed$",
  full.names = TRUE
)

# Read each headerless tab-separated file. `id` is the file name up to the
# first dot; `seq` is what remains of the file name after removing everything
# up to "tot.", the ".bed" part and anything from ".edit" onwards.
# `header` is spelled out (the original relied on partial matching of `head`),
# and stringsAsFactors = FALSE keeps text columns as character on R < 4.0 so
# that the as.double(RepliSeq) conversion below parses the values instead of
# returning factor level codes.
tmp <- lapply(files_to_read, function(x) {
  read.csv(x, header = FALSE, sep = "\t", stringsAsFactors = FALSE) %>%
    mutate(id = gsub("\\..*", "", basename(x))) %>%
    mutate(seq = gsub(".edit.*", "", gsub(".bed", "", gsub("^.*tot.", "", basename((x)))))) %>%
    plyr::rename(c("V1" = "#CHROM", "V2" = "r1", "V3" = "r2", "V4" = "bin"))
})

# NOTE: this reuses the name `merge_df`, replacing the mutation table built at
# the top of the script. as_tibble() replaces the deprecated as.tibble().
merge_df <- do.call(rbind, tmp) %>% as_tibble()

# One column per `seq` value, holding that file's `bin` value.
merge_df <- merge_df %>% spread(seq, bin)

merge_sim_df <- merge_df %>%
  dplyr::select(`#CHROM`, r1, r2, id, RepliSeq, DHS, H3K36me3, H3K9ac, RNAseq, H3K9me3, H4K20me1)

# Use r1 as the position so the table can be joined to the mutation table.
merge_sim_sim_df <- merge_sim_df %>%
  select(-r2) %>%
  plyr::rename(c("r1" = "POS"))

A3A_cl_merge_f_epi_df <- left_join(A3A_cl_merge_f_df, merge_sim_sim_df)
A3A_cl_merge_f_epi_df$RepliSeq <- as.double(A3A_cl_merge_f_epi_df$RepliSeq)

# no_snv as a factor with levels in ascending numeric order.
A3A_cl_merge_f_epi_df$no_snv <- factor(
  A3A_cl_merge_f_epi_df$no_snv,
  levels = ((A3A_cl_merge_f_epi_df %>% arrange(no_snv))$no_snv) %>% unique()
)

# Median RepliSeq value per cluster.
A3A_cl_merge_f_epi_Rep_df <- A3A_cl_merge_f_epi_df %>%
  select(id, sig_cont, APOBEC, dose, TP53, `#CHROM`, cluster_id, cluster, no_snv, RepliSeq)
A3A_cl_merge_f_epi_Rep_df <- A3A_cl_merge_f_epi_Rep_df %>%
  group_by(id, APOBEC, dose, TP53, `#CHROM`, cluster_id, no_snv) %>%
  dplyr::summarise(rep_n = median(RepliSeq))

