library(dplyr)
library(tidyverse)
library(ggplot2)
suppressMessages(library(optparse))
# Command-line options ----
# --cond, --rep and --type have no default value and are required. They are
# validated right after parsing so that a bad invocation fails before any of
# the input tables further down is read.
option_list <- list(
  #  make_option(c("-I", "--input_file"),
  #              help="Input annotated vcf(#CHROM,POS,REF,ALT,gene_dir required)"),
  make_option(c("-c", "--cond"),
              help = "Number of clusters to sample for each simulated data set"),
  make_option(c("-r", "--rep"),
              help = "Numeric replicate label used in the output file names"),
  make_option(c("-t", "--type"),
              help = "Cluster type to simulate: kataegis or omikli"),
  make_option(c("-O", "--output_dir"), default = ".",
              help = "Output directory [default: %default]")
)
# The script takes no positional arguments (positional_arguments = 0 below),
# so the usage line must not advertise one.
parser <- OptionParser(usage = "%prog [options]", option_list = option_list)

args <- parse_args(parser, positional_arguments = 0)
opt <- args$options
file <- args$args

# Report every missing required option at once instead of failing on the
# first one.
required_opts <- c("cond", "rep", "type")
opt_missing <- vapply(
  required_opts,
  function(opt_name) is.null(opt[[opt_name]]),
  logical(1)
)
if (any(opt_missing)) {
  print_help(parser)
  stop(
    "missing required option(s): ",
    paste0("--", required_opts[opt_missing], collapse = ", "),
    call. = FALSE
  )
}

# --cond and --rep are converted with as.numeric() later in the script; a
# value that does not parse would silently become NA there.
for (opt_name in c("cond", "rep")) {
  if (is.na(suppressWarnings(as.numeric(opt[[opt_name]])))) {
    stop("--", opt_name, " must be numeric, got '", opt[[opt_name]], "'",
         call. = FALSE)
  }
}

if (!opt$type %in% c("kataegis", "omikli")) {
  stop("--type must be 'kataegis' or 'omikli', got '", opt$type, "'",
       call. = FALSE)
}

# Echo the effective run parameters.
cat("cond:", opt$cond,
    "rep:", opt$rep,
    "type:", opt$type,
    "output_dir:", opt$output_dir, "\n")

# Input tables ----
# A large scipen penalty makes R write numbers out in full instead of using
# scientific notation.
options(scipen = 999)

# Sample metadata table; its `m/d` column is dropped before joining below.
metadata <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/sig/metadata.txt"
)

# Clustered-mutation annotation (one of several historical input locations).
#cl_df<-read_tsv("/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation/APOBEC_clustered_mutations/annotated/vaf_considered/APOBEC_clustered_mutations.all_samples.annotated.tsv")
cl_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/v2/APOBEC_clustered_mutations/annotated/vaf_considered/APOBEC_clustered_mutations.all_samples.annotated.tsv"
)

# Per-sample mutation tables: every file in the directory whose name matches
# "excl.*gd$".
#files_to_read<-list.files("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter",
#                          ".cont$",
#                          full.names=T)
files_to_read <- list.files(
  path = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter",
  pattern = "excl.*gd$",
  full.names = TRUE
)

# Read each file, tag its rows with a sample id taken from the file name
# (everything from ".mutect2" onwards is stripped) and keep the VCF columns
# plus sig_cont.
vcf_tmp <- lapply(files_to_read, function(vcf_path) {
  sample_tbl <- read_tsv(vcf_path, show_col_types = FALSE)
  sample_tbl <- mutate(
    sample_tbl,
    id = gsub(".mutect2.*", "", basename(vcf_path))
  )
  select(
    sample_tbl,
    `#CHROM`, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sig_cont, id
  )
})

# Stack all samples, keep rows whose sig_cont has the form
# <2 bases>'>'<2 bases>, then attach the metadata and the cluster annotation.
# Both joins use the columns the tables have in common (natural join); the
# cluster annotation columns chr/start/samples are renamed to
# #CHROM/POS/id first so that they line up with the mutation table.
merge_df <- do.call(rbind, vcf_tmp) %>%
  filter(grepl("[ATGC][ATGC]>[ATGC][ATGC]", sig_cont)) %>%
  left_join(select(metadata, -`m/d`)) %>%
  left_join(
    select(
      cl_df,
      `#CHROM` = chr, POS = start, id = samples,
      IMD, cluster_id, cluster_type_omikli_upto_3
    )
  ) %>%
  # Mutations without a cluster annotation get neutral placeholder values.
  mutate(
    IMD = ifelse(is.na(IMD), 0, IMD),
    cluster_id = ifelse(is.na(cluster_id), ".", cluster_id),
    cluster = ifelse(
      is.na(cluster_type_omikli_upto_3),
      "non-clust",
      cluster_type_omikli_upto_3
    )
  ) %>%
  select(-cluster_type_omikli_upto_3)


# A3A clustered mutations ----
# Restrict to A3A samples at the 3ug / 100ng doses.
A3A_cl_merge_df <- merge_df %>%
  filter(dose %in% c("3ug", "100ng"), APOBEC == "A3A") %>%
  mutate(APOBEC = "A3A")

# AMS: per cluster, the number of mutations whose sig_cont matches
# "TC>[GT][AT]". Unclustered mutations ("non-clust") get no AMS value.
cluster_ams_df <- A3A_cl_merge_df %>%
  group_by(id, `#CHROM`, APOBEC, dose, TP53, cluster_id, cluster) %>%
  dplyr::summarise(AMS = sum(grepl("TC>[GT][AT]", sig_cont))) %>%
  filter(cluster != "non-clust")

# Keep only mutations that belong to a cluster with AMS >= 2. Rows without an
# AMS value (NA after the join) are removed by the filter as well.
A3A_cl_merge_f_df <- A3A_cl_merge_df %>%
  left_join(cluster_ams_df) %>%
  filter(AMS >= 2)

# no_snv: number of retained mutations per cluster.
cluster_size_df <- A3A_cl_merge_f_df %>%
  group_by(id, `#CHROM`, cluster_id, cluster) %>%
  dplyr::summarise(no_snv = n())
A3A_cl_merge_f_df <- left_join(A3A_cl_merge_f_df, cluster_size_df)

# Clusters to exclude, identified by the key "<id>_<#CHROM>_<cluster_id>".
ph_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/miss_phased_cluster.txt"
) %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))
sv_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/exclude_cl_mut_df.txt"
) %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))
#over_df<-read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/31_exclude_cluster/overlapped_cluster.v2.edit.txt")%>%mutate(info=paste(id,`#CHROM`,cluster_id,sep="_"))

#exclude_df<-A3A_cl_merge_f_df%>%group_by(`#CHROM`,POS)%>%dplyr::summarise(n=n())%>%arrange(-n)%>%filter(n>1)%>%mutate(info=paste(`#CHROM`,POS,sep="_"))
#A3A_cl_merge_df%>%filter(POS==161259809)%>%select(id)

#A3A_cl_merge_f_df%>%mutate(info=paste(`#CHROM`,POS,sep="_"))%>%
  #filter(!info%in%exclude_df$info)%>%
#  filter(info%in%exclude_df2)

# Build the same key on the mutation table and drop every mutation whose
# cluster appears in either exclusion list.
A3A_cl_merge_f_df <- A3A_cl_merge_f_df %>%
  mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_")) %>%
  filter(
    !info %in% ph_df$info,
    !info %in% sv_df$info
  )
#  filter(!info%in%over_df$info)

# Disabled block: the guard is the constant FALSE, so exclude_df2 is never
# defined when the script runs. The entries are "<chromosome>_<position>"
# keys.
if (FALSE) {
  exclude_df2 <- c(
    "1_161259809", "10_75381008", "12_105634675", "3_61161296",
    "9_131103326", "9_131103609", "9_15693791",
    "X_117173181", "X_117174422", "X_117179969", "X_117179979",
    "X_117183217", "X_117189903", "X_117190393"
  )
}
# Candidate clusters ----
# One row per (id, #CHROM, cluster_id). The exclusion-list filters are applied
# here as well, and a cluster is only kept when it has enough mutations.

# Kataegis clusters with at least 4 mutations.
kat_df <- A3A_cl_merge_f_df %>%
  filter(
    !info %in% ph_df$info,
    !info %in% sv_df$info,
    # !info %in% over_df$info,
    cluster == "kataegis"
  ) %>%
  group_by(id, `#CHROM`, cluster_id) %>%
  dplyr::summarise(n = n()) %>%
  filter(n >= 4) %>%
  select(-n)

# Omikli clusters with at least 2 mutations.
omi_df <- A3A_cl_merge_f_df %>%
  filter(
    !info %in% ph_df$info,
    !info %in% sv_df$info,
    # !info %in% over_df$info,
    cluster == "omikli"
  ) %>%
  group_by(id, `#CHROM`, cluster_id) %>%
  dplyr::summarise(n = n()) %>%
  filter(n >= 2) %>%
  select(-n)

##

#' Ensure that a directory exists
#'
#' Creates `dir`, including missing parent directories, when it does not exist
#' yet, and reports on standard output which of the two cases applied.
#'
#' @param dir Path of the directory to ensure.
#' @return `dir`, invisibly. Stops with an error when the directory neither
#'   exists nor can be created.
create_dir <- function(dir) {
  if (dir.exists(dir)) {
    cat('A directory',dir,'already exists.\n')
    return(invisible(dir))
  }

  # dir.create() signals failure only through a warning and a FALSE return
  # value, so the result has to be checked before success is reported.
  created <- dir.create(dir, recursive = TRUE, showWarnings = FALSE)
  if (!created && !dir.exists(dir)) {
    stop("could not create directory '", dir, "'", call. = FALSE)
  }

  cat('A new directory',dir,'has been created.\n')
  invisible(dir)
}

# Background mutation pool ----
# Presumably PCAWG-derived mutations, judging by the file path (TODO confirm).
# Each row is keyed by "<#CHROM>_<POS>".
pcawg_std_df <- read_tsv(
  "/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/discovery_simulation/add_simulation/00_cancer_files/link/pcwag_std.merge.vcf"
) %>%
  mutate(info = paste(`#CHROM`, POS, sep = "_"))

# n: how many rows share the same position key.
position_count_df <- pcawg_std_df %>%
  group_by(info) %>%
  dplyr::summarise(n = n())
pcawg_std_df <- left_join(pcawg_std_df, position_count_df)

# Keep positions that occur exactly once, on chromosomes 1-22 and X.
pcawg_std_df <- pcawg_std_df %>%
  filter(
    n == 1,
    `#CHROM` %in% c(as.character(1:22), "X")
  )

# Give every background mutation a random VAF drawn from 0.400, 0.401, ...,
# 0.600.
pcawg_std_df$VAF <- sample(
  seq(0.4, 0.6, by = 0.001),
  nrow(pcawg_std_df),
  replace = TRUE
)
## run parameters (shared by the kataegis and omikli simulations) ##
# Output directory: created when it does not exist yet.
dir <- opt$output_dir
create_dir(dir)

# --cond and --rep arrive as command-line values and are converted to numeric;
# the simulation loops iterate over these vectors.
cond_vec <- as.numeric(opt$cond)
rep_vec <- as.numeric(opt$rep)

# Echo the parsed values to standard output.
print(cond_vec)
print(rep_vec)

# Cluster type that selects which simulation is run.
cl_type <- opt$type
# Simulated VCF generation ----
# For the requested cluster type and every (cond, rep) combination:
#   1. draw `cond` clusters without replacement from the candidate table,
#   2. write the mutations of those clusters to "<std_prefix><cond>-<rep>.tsv",
#   3. for every target total, pad the cluster mutations with randomly drawn
#      rows of pcawg_std_df up to that total and write 50 independent draws as
#      "<type>_sample_vcf_<cond>-<rep>_<total>_<draw>.vcf".
# Kataegis and omikli differ only in the candidate table, the cluster label
# and the prefix of the .tsv file, so a single loop serves both.
cluster_specs <- list(
  kataegis = list(candidates = kat_df, std_prefix = "std_kat."),
  omikli = list(candidates = omi_df, std_prefix = "std_omi.")
)
if (is.null(cl_type) || !cl_type %in% names(cluster_specs)) {
  # An unknown type used to fall through both branches without any output.
  stop(
    "--type must be one of: ",
    paste(names(cluster_specs), collapse = ", "),
    call. = FALSE
  )
}
cluster_spec <- cluster_specs[[cl_type]]
candidate_df <- cluster_spec$candidates
n_candidates <- nrow(candidate_df)
n_background <- nrow(pcawg_std_df)

# Target number of mutations per simulated VCF and number of random draws
# written for each target.
total_sizes <- c(
  seq(2000, 10000, by = 2000),
  seq(15000, 100000, by = 5000),
  seq(110000, 200000, by = 10000)
)
n_draws <- 50

for (n_clusters in cond_vec) {
  if (is.na(n_clusters) || n_clusters < 0 || n_clusters > n_candidates) {
    stop(
      "cannot sample ", n_clusters, " ", cl_type, " cluster(s): ",
      n_candidates, " candidate cluster(s) available",
      call. = FALSE
    )
  }

  for (rep_id in rep_vec) {
    # sample.int() also behaves for an empty candidate table, where
    # sample(1:nrow(...)) would draw from c(1, 0).
    sampled_df <- candidate_df[
      sample.int(n_candidates, n_clusters, replace = FALSE),
    ] %>%
      mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_"))

    # All mutations of the sampled clusters. cluster_id is carried along from
    # merge_df directly: re-attaching it by (#CHROM, POS) alone would also
    # match mutations at the same position in other samples and add rows with
    # a foreign cluster_id.
    std_df <- merge_df %>%
      mutate(info = paste(id, `#CHROM`, cluster_id, sep = "_")) %>%
      filter(
        !info %in% ph_df$info,
        !info %in% sv_df$info,
        # !info %in% over_df$info,
        cluster == .env$cl_type,
        info %in% sampled_df$info
      ) %>%
      select(
        `#CHROM`, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
        info, cluster_id
      )

    n_cluster_muts <- nrow(std_df)
    std_df <- mutate(std_df, Clonality = "clonal [NA]")
    # Random VAF per cluster mutation, drawn from 0.400, 0.401, ..., 0.600.
    std_df$VAF <- sample(
      seq(0.4, 0.6, by = 0.001),
      n_cluster_muts,
      replace = TRUE
    )
    # cluster_id stays the last column, as in the files written so far.
    std_df <- std_df %>%
      relocate(cluster_id, .after = last_col()) %>%
      arrange(`#CHROM`, cluster_id)

    write.table(
      unique(std_df),
      file.path(
        dir,
        paste0(cluster_spec$std_prefix, n_clusters, "-", rep_id, ".tsv")
      ),
      sep = "\t",
      quote = FALSE,
      row.names = FALSE
    )

    # The cluster part of every simulated VCF is the same for all totals and
    # draws, so it is prepared once.
    cluster_vcf_df <- select(std_df, -cluster_id, -info)

    for (total in total_sizes) {
      n_extra <- total - n_cluster_muts
      if (n_extra < 0 || n_extra > n_background) {
        stop(
          "cannot build a VCF with ", total, " mutations: ",
          n_cluster_muts, " cluster mutation(s) and ",
          n_background, " background mutation(s) available",
          call. = FALSE
        )
      }

      for (draw in seq_len(n_draws)) {
        picked_rows <- sample.int(n_background, n_extra, replace = FALSE)
        background_df <- select(pcawg_std_df[picked_rows, ], -info, -n)

        # rbind() matches the columns by name; the combined table is sorted
        # by position before it is written.
        sim_df <- rbind(cluster_vcf_df, background_df) %>%
          arrange(`#CHROM`, POS)

        write.table(
          unique(sim_df),
          file.path(
            dir,
            paste0(
              cl_type, "_sample_vcf_",
              n_clusters, "-", rep_id, "_", total, "_", draw,
              ".vcf"
            )
          ),
          sep = "\t",
          quote = FALSE,
          row.names = FALSE
        )
      }
    }
  }
}



