library(dplyr)
library(tidyverse)
library(ggplot2)
# Load every phasing table found in the clustered-mutation directory and
# stack them into `phs_merge_df`, tagging each row with an `id` derived from
# the name of the file it came from.
phs_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/v3",
  pattern = "phasing.txt",
  full.names = TRUE
)
# With no input files `do.call(rbind, list())` silently yields NULL and the
# script only fails much later, when the phasing table is joined. Fail here
# with a clear message instead.
if (length(phs_files) == 0) {
  stop("no phasing.txt files found in the clustered-mutation directory",
       call. = FALSE)
}
phs_tmp <- lapply(phs_files, function(x) {
  # `id` is the file name with everything from the first ".cl" match removed.
  # NOTE(review): the leading "." is an unescaped regex wildcard, so any
  # character followed by "cl" starts the match -- confirm this is intended
  # for the actual file naming scheme.
  read_tsv(x) %>%
    mutate(id = gsub(".cl.*", "", basename(x)))
})

phs_merge_df <- do.call(rbind, phs_tmp)


# Metadata table; later in this script it is joined onto `merge_df` after its
# `m/d` column has been dropped.
metadata <- read_tsv(
  file = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/sig/metadata.txt"
)
# metadata   # uncomment to inspect interactively

# Annotated clustered-mutation table; the columns chr, start, samples, IMD,
# cluster_id and cluster_type_omikli_upto_3 are used further down.
cl_df <- read_tsv(
  file = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/v2/APOBEC_clustered_mutations/annotated/vaf_considered/APOBEC_clustered_mutations.all_samples.annotated.tsv"
)
# cl_df      # uncomment to inspect interactively

# Read the per-sample variant tables from the `new_filter` directory, keep
# only the columns used downstream, and stack them into `merge_df`.
files_to_read <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter",
  pattern = ".excl.*gd$",
  full.names = TRUE
)
# Show which files were picked up (auto-prints when run interactively).
files_to_read

# With no input files `do.call(rbind, list())` yields NULL and the filter
# below dies with an opaque "no applicable method" error; stop early with a
# clear message instead.
if (length(files_to_read) == 0) {
  stop("no variant files matching '.excl.*gd$' found in new_filter",
       call. = FALSE)
}

vcf_tmp <- lapply(files_to_read, function(x) {
  # `id` is the file name with everything from the first ".mutect2" match
  # removed (the leading "." is a regex wildcard, kept as in the original).
  read_tsv(x) %>%
    mutate(id = gsub(".mutect2.*", "", basename(x))) %>%
    select(id, `#CHROM`, POS, REF, ALT, sig_cont)
})
merge_df <- do.call(rbind, vcf_tmp)
# Keep only rows whose `sig_cont` contains a two-base ">" two-base pattern
# (e.g. "TC>TA").
merge_df <- merge_df %>%
  filter(grepl("[ATGC][ATGC]>[ATGC][ATGC]", sig_cont))

# Attach the metadata columns (all except `m/d`) to every mutation row.
# No `by` is given, so dplyr joins on whatever column names the two tables
# share.
merge_df <- left_join(merge_df, metadata %>% select(-`m/d`))

# Preview of the cluster annotation columns that get joined next
# (auto-prints when run interactively; the value is not stored).
cl_df %>%
  select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3)

# Attach cluster annotation. The cluster table's key columns are renamed to
# the names used in `merge_df` so the natural join can match on them.
# `dplyr::rename` replaces the former `plyr::rename` call: same result, but
# without depending on the plyr package, which this script never loads.
merge_df <- left_join(
  merge_df,
  cl_df %>%
    select(chr, start, samples, IMD, cluster_id, cluster_type_omikli_upto_3) %>%
    dplyr::rename(`#CHROM` = chr, POS = start, id = samples)
)

# Rows without a match in the cluster table get explicit defaults:
# IMD 0, cluster_id ".", cluster "non-clust".
# `ifelse` is kept on purpose: it coerces types where needed, whereas
# `coalesce` would fail if `cluster_id` had been read as a number.
merge_df <- merge_df %>%
  mutate(
    IMD = ifelse(is.na(IMD), 0, IMD),
    cluster_id = ifelse(is.na(cluster_id), ".", cluster_id),
    cluster = ifelse(
      is.na(cluster_type_omikli_upto_3),
      "non-clust",
      cluster_type_omikli_upto_3
    )
  ) %>%
  select(-cluster_type_omikli_upto_3)

# Show the distinct dose labels present (auto-prints when run interactively).
merge_df$dose %>% unique()

# Restrict to A3A rows at the "3ug" and "100ng" doses.
# The former `mutate(APOBEC = "A3A")` step was removed: after filtering on
# APOBEC == "A3A" every remaining row already carries that value.
A3A_cl_merge_df <- merge_df %>%
  filter(dose %in% c("3ug", "100ng")) %>%
  filter(APOBEC == "A3A")

# Per cluster (within id / chromosome / sample annotation), count the rows
# whose `sig_cont` matches "TC>[GT][AT]" as AMS, drop the "non-clust"
# pseudo-cluster, and keep only mutations that belong to a cluster with
# AMS >= 2. Rows without a matching summary get AMS = NA from the left join
# and are removed by the final filter.
A3A_cl_merge_f_df <- left_join(
  A3A_cl_merge_df,
  A3A_cl_merge_df %>%
    group_by(id, `#CHROM`, APOBEC, dose, TP53, cluster_id, cluster) %>%
    dplyr::summarise(AMS = sum(grepl("TC>[GT][AT]", sig_cont))) %>%
    ungroup() %>%
    filter(cluster != "non-clust")
) %>%
  filter(AMS >= 2)

# Add `no_snv`: the number of retained rows in each cluster.
A3A_cl_merge_f_df <- left_join(
  A3A_cl_merge_f_df,
  A3A_cl_merge_f_df %>%
    group_by(id, `#CHROM`, cluster_id, cluster) %>%
    dplyr::summarise(no_snv = n()) %>%
    ungroup()
)

# Attach the phasing table to the retained cluster mutations (natural join on
# the column names the two tables share).
A3A_cl_merge_f_phs_df <- left_join(A3A_cl_merge_f_df, phs_merge_df)

# Count rows per cluster and `phsBF` value, then spread the counts into one
# column per distinct `phsBF` value.
# NOTE(review): `spread()` is superseded by `pivot_wider()`; it is kept here
# so the resulting column names and order stay exactly as before.
A3A_cl_merge_f_phs_count_df <- A3A_cl_merge_f_phs_df %>%
  group_by(id, `#CHROM`, cluster_id, cluster, phsBF) %>%
  dplyr::summarise(n = n()) %>%
  spread(phsBF, n)

# Combinations that never occurred come out of spread() as NA; treat them as
# a count of zero.
A3A_cl_merge_f_phs_count_df[is.na(A3A_cl_merge_f_phs_count_df)] <- 0

# If no row has phsBF == 0 the `0` column does not exist and the filter below
# would abort with "object '0' not found". Create it as all-zero so the
# script then writes a header-only file instead of crashing.
if (!("0" %in% names(A3A_cl_merge_f_phs_count_df))) {
  A3A_cl_merge_f_phs_count_df[["0"]] <- 0
}

# Write out every cluster that has at least one row with phsBF == 0
# (presumably the mis-phased clusters, going by the output file name --
# confirm the meaning of phsBF == 0).
A3A_cl_merge_f_phs_count_df %>%
  filter(`0` > 0) %>%
  ungroup() %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_")) %>%
  select(id, `#CHROM`, cluster_id, info) %>%
  write.table(
    "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/miss_phased_cluster.txt",
    row.names = FALSE,
    sep = "\t",
    quote = FALSE
  )

