library(dplyr)
library(tidyverse)
library(ggplot2)


# Locate the per-sample phasing tables.
# `pattern` is a regular expression (not a shell glob), so the dots are escaped
# and the match is anchored to the end of the file name.
ph_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/v3/",
  pattern = "cl_info\\.phasing\\.txt$",
  full.names = TRUE
)

# Read each phasing table and tag its rows with an `id` derived from the file
# name: everything from the character preceding "cl_info" onward is removed.
ph_tmp <- lapply(ph_files, function(x) {
  read_tsv(x) %>%
    mutate(id = gsub(".cl_info.*", "", basename(x)))
})

# Stack the per-file tables into a single data frame.
ph_df <- do.call(rbind, ph_tmp)

# Cluster/SV table for the A3A samples, joined with the phasing information.
# The join has no explicit `by`, so it uses every column name the two tables share.
A3A_cl_merge_f_br_sv_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/A3A_cl_sv_df.v3.txt")
A3A_cl_merge_f_br_sv_ph_df <- A3A_cl_merge_f_br_sv_df %>%
  left_join(ph_df)


# Print the distinct variant/phasing rows of one specific cluster
# (sample "A3A_C3_TP53_C3_100ng-3", chromosome "10", cluster_id "2").
A3A_cl_merge_f_br_sv_ph_df %>%
  filter(id == "A3A_C3_TP53_C3_100ng-3") %>%
  filter(cluster_id == "2") %>%
  filter(`#CHROM` == "10") %>%
  dplyr::select(
    id, `#CHROM`, cluster_id, POS, REF, ALT,
    sig_cont, cluster, phsBF, phsAF
  ) %>%
  unique()

#mis-phased cluster events###
#id                   `#CHROM` cluster_id cluster zero_count
#<chr>                <chr>    <chr>      <chr>        <dbl>
#1 A3A_1st_C3_3ug-2     12       14         omikli           1
#2 A3A_1st_C3_3ug-2     5        1          omikli           1
#3 A3A_C3_TP53_C3_3ug-3 4        5          omikli           1
#4 A3A_C3_TP53_C3_3ug-3 4        8          omikli           1






# PCAWG cluster/breakpoint table; `cluster_id` is forced to character on read.
cl_sv_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/timing_clonality/v2/vcf/separate_clonal/pcawg_cl_br_info.txt",
  col_types = c("cluster_id" = "c")
)

# One row per distinct cluster with its breakpoint range (min_br / max_br),
# taken from the PCAWG table first and the A3A table second, then stacked.
tot_br_df <- do.call(
  rbind,
  lapply(list(cl_sv_df, A3A_cl_merge_f_br_sv_df), function(src) {
    unique(dplyr::select(src, id, `#CHROM`, cluster, cluster_id, AMS, no_snv, min_br, max_br))
  })
)

# Clusters whose `dist` is below 1000, collected from the PCAWG table and the
# A3A table (in that order). `info` is the "<id>_<chrom>_<cluster_id>" key that
# later steps use to look clusters up in this table.
sv_cor_cl_df <- do.call(
  rbind,
  lapply(list(cl_sv_df, A3A_cl_merge_f_br_sv_df), function(src) {
    src %>%
      filter(dist < 1000) %>%
      dplyr::select(id, `#CHROM`, cluster, cluster_id, AMS, no_snv, sv_type, br, l_dist, r_dist) %>%
      unique()
  })
) %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_")) %>%
  unique()

# Export the table as tab-separated text without quoting or row names.
write.table(
  sv_cor_cl_df,
  file = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/exclude_cl_mut_df.v2.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# Read one segment table (with `serial_info` forced to character) and tag its
# rows with an `id` derived from the file name: everything from the character
# preceding "cl_info" onward is removed.
read_discord_seg <- function(seg_path) {
  seg_tbl <- read_tsv(seg_path, col_types = c("serial_info" = "c"))
  seg_tbl %>% mutate(id = gsub(".cl_info.*", "", basename(seg_path)))
}

# Segment files for the PCAWG samples and for the A3A samples.
pcawg_discord_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/pcawg",
  "seg.txt",
  full.names = TRUE
)
A3A_discord_files <- list.files(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/v3",
  "seg.txt",
  full.names = TRUE
)

# Load every file of each set and stack the per-file tables.
pcawg_discord_tmp <- lapply(pcawg_discord_files, read_discord_seg)
pcawg_discord_df <- do.call(rbind, pcawg_discord_tmp)

A3A_discord_tmp <- lapply(A3A_discord_files, read_discord_seg)
A3A_discord_df <- do.call(rbind, A3A_discord_tmp)

# Export the A3A table with constant annotation columns attached.
# NOTE(review): this writes a column named `Project_code`, whereas the rest of
# the script refers to `Project_Code` -- confirm which spelling consumers expect.
write.table(
  A3A_discord_df %>%
    mutate(APOBEC = "A3A", Project_code = "A3A", sub_project_code = "A3A"),
  file = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/v3/discordant_cluster.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# Combined table: A3A rows first, PCAWG rows second.
merge_discord_df <- rbind(A3A_discord_df, pcawg_discord_df)




# Per-sample APOBEC label for the PCAWG samples.
pcawg_APOBEC_df <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/fig/00_cl_ratio/pcawg_APOBEC_clonal_table.txt") %>%
  dplyr::select(id, APOBEC)

# Sample metadata, keyed by `id` (renamed from Tumor_Sample_Barcode) and without
# the histology_abbreviation column.
metadata <- read_tsv("/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/target_sample.txt") %>%
  dplyr::select(-histology_abbreviation) %>%
  plyr::rename(c("Tumor_Sample_Barcode" = "id"))

# Attach the annotation to every discordant-cluster row. Neither join names
# `by`, so each one uses all column names shared by its two inputs.
merge_discord_df <- left_join(merge_discord_df, left_join(pcawg_APOBEC_df, metadata))

# Rows without annotation are labelled "A3A". The fill is restricted to
# character columns: writing a string into numeric columns would coerce them to
# character (or fail outright on recent tibble versions).
# NOTE(review): any row that fails to match the annotation is labelled "A3A",
# including PCAWG rows absent from the annotation tables -- confirm none exist.
is_chr_col <- vapply(merge_discord_df, is.character, logical(1))
merge_discord_df[is_chr_col] <- lapply(
  merge_discord_df[is_chr_col],
  function(v) replace(v, is.na(v), "A3A")
)

# Sub-project code: Project_Code with everything from the first "-" removed.
merge_discord_df <- merge_discord_df %>%
  mutate(sub_project_code = gsub("-.*", "", Project_Code))



# A3A rows restricted to the kataegis and omikli cluster classes.
A3A_merge_discord_df <- merge_discord_df %>%
  filter(Project_Code == "A3A") %>%
  filter(cluster %in% c("kataegis", "omikli"))

# Hard-coded number of single-segment clusters to subtract per cluster class.
# Keyed by name so the values cannot be attached to the wrong row; the previous
# positional binding relied on the summary rows being ordered kataegis, omikli.
# NOTE(review): presumably these are counts derived from `overlap_df` -- confirm
# and compute them instead of hard-coding.
overlap_by_cluster <- c(kataegis = 4, omikli = 8)

# Per cluster class: fraction of clusters spanning more than one segment
# ("multi") versus a single segment after subtracting the overlap count ("uni").
A3A_merge_discord_type_df <- A3A_merge_discord_df %>%
  mutate(seg_group = ifelse(segment > 1, "multi_seg", "seg")) %>%
  group_by(cluster, seg_group) %>%
  dplyr::summarise(n = n()) %>%
  # A cluster class lacking one of the two groups counts as 0 rather than NA.
  spread(seg_group, n, fill = 0) %>%
  ungroup() %>%
  mutate(overlap = unname(overlap_by_cluster[as.character(cluster)])) %>%
  mutate(cor_seg = seg - overlap) %>%
  mutate(tot = multi_seg + cor_seg) %>%
  mutate(
    ratio_multi = multi_seg / tot,
    ratio_uni = cor_seg / tot
  ) %>%
  dplyr::select(cluster, ratio_multi, ratio_uni) %>%
  gather("type", "ratio", ratio_multi:ratio_uni)

# Short labels, ordered with "uni" before "multi".
A3A_merge_discord_type_df <- A3A_merge_discord_type_df %>%
  mutate(type = ifelse(type == "ratio_multi", "multi", "uni"))
A3A_merge_discord_type_df$type <- factor(
  A3A_merge_discord_type_df$type,
  levels = c("uni", "multi")
)


# Number of A3A clusters per (cluster class, segment count), leaving out every
# cluster whose "<id>_<chrom>_<cluster_id>" key is listed in overlap_df$info.
# NOTE(review): `overlap_df` is not created in this part of the script; it must
# already exist when this runs.
A3A_merge_discord_sim_df <- A3A_merge_discord_df %>%
  filter(!(paste(id, `#CHROM`, cluster_id, sep = "_") %in% overlap_df$info)) %>%
  dplyr::group_by(cluster, segment) %>%
  dplyr::summarise(count = n())

# Segment count as a factor with fixed levels 1..6; any other value becomes NA.
A3A_merge_discord_sim_df[["segment"]] <- factor(
  A3A_merge_discord_sim_df[["segment"]],
  levels = 1:6
)



# Export the annotated discordant-cluster table as tab-separated text,
# without quoting or row names.
merge_discord_df %>%
  write.table(
    file = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/30_clustered_mutation/discordant_cluster.txt",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )


# Convert cluster_id in the breakpoint table to double before joining.
tot_br_df <- tot_br_df %>%
  mutate(cluster_id = as.double(cluster_id))

# Discordant clusters joined with their breakpoint range (the join uses every
# shared column name) plus derived columns:
#   dist         - breakpoint span, max_br - min_br + 1
#   project_type - "A3A" when Project_Code is "A3A", otherwise "PCAWG"
#   info         - "<id>_<chrom>_<cluster_id>" key
#   sv_cor       - "cor" when the key appears in sv_cor_cl_df, otherwise "free"
merge_discord_sv_df <- merge_discord_df %>%
  left_join(tot_br_df) %>%
  mutate(
    dist = max_br - min_br + 1,
    project_type = ifelse(Project_Code == "A3A", "A3A", "PCAWG"),
    info = paste(id, `#CHROM`, cluster_id, sep = "_"),
    sv_cor = ifelse(info %in% sv_cor_cl_df$info, "cor", "free")
  )

# Print the A3A kataegis clusters whose breakpoint span exceeds 10000.
merge_discord_sv_df %>%
  filter(cluster == "kataegis", project_type == "A3A", dist > 10000)

# Segment class: "1" for one segment; for two segments "2-1" when serial_info
# starts with "1," or ends with ",1", otherwise "2"; "multiple" for the rest.
merge_discord_sv_df <- merge_discord_sv_df %>%
  mutate(
    new_segment = ifelse(
      segment == 1,
      "1",
      ifelse(
        segment == 2,
        ifelse(grepl("^1,", serial_info) | grepl(",1$", serial_info), "2-1", "2"),
        "multiple"
      )
    )
  )



###exclude_sv##
###############

# Add the "<id>_<chrom>_<cluster_id>" key to the discordant-cluster table.
merge_discord_df <- merge_discord_df %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))

# Remove clusters listed in sv_cor_cl_df, then clusters listed in the rows of
# overlap_df whose `standard` equals 0.
# NOTE(review): `overlap_df` is not created in this part of the script; it must
# already exist when this runs.
merge_discord_excl_df <- merge_discord_df %>%
  filter(!info %in% sv_cor_cl_df$info)
merge_discord_excl_df <- merge_discord_excl_df %>%
  filter(!info %in% overlap_df$info[which(overlap_df$standard == 0)])

# Cluster counts per sub-project and cluster class (excluding "other"), with one
# column per segment count; combinations that do not occur are set to 0.
merge_discord_excl_count_df <- merge_discord_excl_df %>%
  filter(cluster != "other") %>%
  group_by(sub_project_code, cluster, segment) %>%
  dplyr::summarise(n = n()) %>%
  spread(key = segment, value = n)

merge_discord_excl_count_df <- replace(
  merge_discord_excl_count_df,
  is.na(merge_discord_excl_count_df),
  0
)



# Kataegis clusters of APOBEC == "A3A" samples, counted per sub-project, with
# one column per segment count; missing combinations are set to 0.
discord_table <- merge_discord_excl_df %>%
  filter(cluster == "kataegis", APOBEC == "A3A") %>%
  group_by(sub_project_code, segment) %>%
  dplyr::summarise(n = n()) %>%
  spread(key = segment, value = n)

discord_table <- replace(discord_table, is.na(discord_table), 0)



# Per sub-project, among kataegis clusters of APOBEC == "A3A" samples:
#   same_count  - clusters with exactly one segment
#   mixed_count - clusters with more than one segment
# Both counts come from a single grouped summary, so a sub-project that has only
# multi-segment clusters is kept with same_count = 0. The earlier left join
# anchored on the single-segment summary dropped such sub-projects.
discord_table2 <- merge_discord_excl_df %>%
  filter(cluster == "kataegis", APOBEC == "A3A") %>%
  group_by(sub_project_code) %>%
  dplyr::summarise(
    same_count = sum(segment == 1, na.rm = TRUE),
    mixed_count = sum(segment > 1, na.rm = TRUE)
  )

discord_table2

