# Joonoh Lim (joonoh.lim@kaist.ac.kr)
# -----------------------------------
# First created on 2022-04-24
# Last modified on 2022-04-24,25,28
# ------------------------------------------------------------------------------
library(data.table)
library(magrittr)
library(gtools)
library(pbmcapply)
library(BSgenome.Hsapiens.Ensembl.GRCh37.jolim)
library(ggplot2)
library(ggbreak)
# library(ggthemes)
library(lemon)
library(scales)
library(flexmix) # flexmix
# library(mixtools) # poisregmixEM
# library(rebmix)
# ------------------------------------------------------------------------------

# Helper functions -------------------------------------------------------------
get_imd <- function(a) {
  # Nearest-neighbour intermutation distance (IMD) for every element of `a`.
  #
  # Args:
  #   a: numeric/integer vector of positions. Only consecutive differences
  #      are taken, so the values are used in the order given.
  # Returns:
  #   A vector as long as `a`. Interior elements get the smaller of the gaps
  #   to the previous and to the next element; the first and last element
  #   use their single neighbour. A lone position has no neighbour and
  #   yields NA; empty input yields an empty vector.
  n <- length(a)
  if (n < 2L) {
    # No neighbour to measure against.
    return(rep(a[NA_integer_], n))
  }
  gaps <- diff(a)
  gap_to_next <- c(gaps, gaps[n - 1L]) # last element borrows its left gap
  gap_to_prev <- c(gaps[1L], gaps)     # first element borrows its right gap
  # pmin() replaces the masked assignment `x[r < l] <- r[r < l]`, which can
  # stop with "NAs are not allowed in subscripted assignments" when `a`
  # contains NA. With pmin() the NA simply propagates to the affected gaps.
  pmin(gap_to_next, gap_to_prev)
}
# For test,
# a <- x[Chrom=='1',POS]


assign_cluster <- function(pos,imd_cutoff) {
  # Splits a vector of positions into runs ("clusters") of neighbours.
  #
  # Args:
  #   pos:        numeric/integer positions, used in the order given.
  #   imd_cutoff: distance threshold; only the first element is used.
  # Returns:
  #   Character vector as long as `pos` holding the cluster number
  #   ("1", "2", ...) of each position. A new cluster starts at every
  #   position whose gap to the previous position is strictly greater than
  #   `imd_cutoff`. Empty input gives character(0).
  #
  # The cluster number is the running count of such breaks, so a single
  # cumsum() replaces the former per-cluster loop. Ids are built from
  # integers and are therefore always plain digits (the loop coerced a
  # double counter, which prints e.g. 1e+05 for the 100000th cluster).
  if (length(pos) == 0L) {
    return(character(0))
  }
  imd_cutoff <- imd_cutoff[1]
  gap <- c(0, diff(pos)) # the first position has no predecessor
  starts_new_cluster <- (gap - imd_cutoff) > 0
  # An undefined gap (NA position or cutoff) never opens a new cluster,
  # matching the previous which()-based behaviour.
  starts_new_cluster[is.na(starts_new_cluster)] <- FALSE
  as.character(cumsum(starts_new_cluster) + 1L)
}
# For test,
# pos <- dat[chr=='2',start]
# imd_cutoff <- dat$imd_cutoff[1]
# assign_cluster(pos,imd_cutoff)


classify_substitution_type <- function(imd,vaf,vaf_diff_cutoff) {
  # Labels each mutation as 'sbs', 'dbs' or 'mbs' from runs of imd == 1.
  #
  # Args:
  #   imd:             nearest-neighbour distance of each mutation.
  #   vaf:             value per mutation, compared against mean(vaf).
  #   vaf_diff_cutoff: only the first element is used. Mutations whose
  #                    |mean(vaf) - vaf| exceeds it are treated as having
  #                    no adjacent neighbour (their imd is set to Inf).
  # Returns:
  #   Character vector as long as `imd`: a run of exactly two consecutive
  #   imd == 1 entries is 'dbs', a longer run is 'mbs', everything else
  #   'sbs'.
  vaf_diff_cutoff <- vaf_diff_cutoff[1]
  imd[abs(mean(vaf) - vaf) > vaf_diff_cutoff] <- Inf
  # An unknown distance (NA, e.g. a mutation without any neighbour) counts
  # as "not adjacent"; previously it left the label as an empty string.
  is_adjacent <- !is.na(imd) & imd == 1
  run_lengths <- rle(is_adjacent)$lengths
  # Length of the run each element sits in, zeroed outside imd == 1 runs.
  adjacent_run <- rep(run_lengths, times = run_lengths) * is_adjacent
  subs_type <- rep('sbs', length(imd))
  subs_type[adjacent_run == 2] <- 'dbs'
  subs_type[adjacent_run > 2] <- 'mbs'
  subs_type
}
# For test,
# vaf <- c(1,1,1,1,1,2,1,1,1,1,1,1,2,1,1,1)
# imd <- c(1,1,2,1,1,1,1,2,2,1,1,2,2,2,1,1)
# expected: dbs, dbs, sbs, dbs, dbs, sbs, sbs, sbs, sbs, dbs, dbs, sbs, sbs, sbs, dbs, dbs


classify_cluster_type <- function(imd,subs_type,vaf,vaf_diff_cutoff,omikli_upto_n_muts=3) {
  # Labels the mutations of ONE cluster as 'other', 'dbs', 'mbs', 'omikli'
  # or 'kataegis'.
  #
  # Args:
  #   imd:                per-mutation distances; only its length is used.
  #   subs_type:          per-mutation 'sbs'/'dbs'/'mbs' labels.
  #   vaf:                per-mutation value compared against mean(vaf).
  #   vaf_diff_cutoff:    only the first element is used; mutations with
  #                       |mean(vaf) - vaf| above it are labelled 'other'
  #                       and excluded from the event count.
  #   omikli_upto_n_muts: largest event count still called 'omikli'
  #                       (must be >= 2).
  # Returns:
  #   Character vector as long as `imd`.
  #
  # Event count: every 'sbs' counts once, every run of consecutive 'dbs'
  # (or 'mbs') labels counts once.
  vaf_diff_cutoff <- vaf_diff_cutoff[1]

  # Sanity check
  if (omikli_upto_n_muts < 2) {
    stop("'omikli_upto_n_muts' should be greater than 1.")
  }

  # Other
  cluster_type <- vector('character',length(imd))
  idx_for_other <- abs((mean(vaf) - vaf)) > vaf_diff_cutoff
  cluster_type[idx_for_other] <- "other"
  subs_type[idx_for_other] <- "other"

  run_labels <- rle(subs_type)$values
  n_sbs <- sum(subs_type == 'sbs')
  n_dbs <- sum(run_labels == 'dbs')
  n_mbs <- sum(run_labels == 'mbs')
  n_clustered_muts <- n_sbs + n_dbs + n_mbs

  unassigned <- cluster_type == ''
  if (n_clustered_muts == 0) {
    # Nothing countable (normally: every mutation is 'other').
    # This branch used to be a stand-alone `if`, so execution fell through
    # to the 'omikli' branch below and mislabelled any leftover blanks.
    # Now the labels are really left untouched.
  } else if (n_clustered_muts == 1) {
    if (n_dbs != 0) {
      # A single doublet
      cluster_type[unassigned] <- 'dbs'
    } else if (n_mbs != 0) {
      if (!any(unassigned)) {
        stop("Something went wrong with 'cluster_type'")
      }
      cluster_type[unassigned] <- 'mbs'
    } else {
      # One lone sbs next to 'other' mutations is not a cluster
      # (2022-05-01 patch).
      cluster_type[unassigned] <- 'other'
    }
  } else if (n_clustered_muts <= omikli_upto_n_muts) {
    cluster_type[unassigned] <- 'omikli'
  } else {
    cluster_type[unassigned] <- 'kataegis'
  }
  cluster_type
}
# For test,
# 0:
# imd <- c(0,0)
# imd <- c(1,1,2,1,1,1,1,2,2,1,1,2,2,2,1,1)
# 1:
# imd <- dat[chr=='1' & cluster_id == '1',imd]
# cluster_id <- dat[chr=='1' & cluster_id == '1',cluster_id]
# vaf <- dat[chr=='1' & cluster_id == '1',VAF]
# 2:
# imd <- dat[chr=='2' & cluster_id == '3',imd]
# cluster_id <- dat[chr=='2' & cluster_id == '3',cluster_id]
# vaf <- dat[chr=='2' & cluster_id == '3',VAF]
# 3:
# x <- out_all_in_one[samples=='A3A_1st_C3_3ug-2' & chr == '22']
# x <- dat[samples=='A3A_1st_C3' & chr == '1' & cluster_id == '8']
# imd <- x$imd
# vaf <- x$VAF
# subs_type <- x$subs_type
# vaf_diff_cutoff <- 0.1


create_dir <- function(dir) {
  # Creates `dir` (including missing parents) unless it already exists and
  # prints what happened.
  #
  # Args:
  #   dir: path of the directory.
  # Returns:
  #   NULL, invisibly.
  if (dir.exists(dir)) {
    cat('A directory',dir,'already exists.\n')
    return(invisible(NULL))
  }
  # dir.create() reports failure through its return value (plus a warning);
  # the success message used to be printed regardless.
  if (dir.create(dir,recursive=TRUE)) {
    cat('A new directory',dir,'has been created.\n')
  } else {
    cat('Failed to create directory',dir,'\n')
  }
  invisible(NULL)
}


annotate_sequence_context <- function(dt,
                                      n_flanking_bases_5p=1,
                                      n_flanking_bases_3p=1,
                                      chr_name='chr',
                                      pos_name='start',
                                      genome=BSgenome.Hsapiens.Ensembl.GRCh37.jolim){
  # Adds the genome sequence flanking each position to `dt` BY REFERENCE as
  # columns `seqCtx_5p` and `seqCtx_3p`.
  #
  # Args:
  #   dt:                  data.table with a chromosome and a position column.
  #   n_flanking_bases_5p: bases fetched before the position
  #                        (range pos - n .. pos - 1).
  #   n_flanking_bases_3p: bases fetched after the position
  #                        (range pos + 1 .. pos + n).
  #   chr_name, pos_name:  names of the chromosome / position columns.
  #   genome:              object passed to getSeq().
  # Returns:
  #   `dt`, invisibly (the value of the last `:=`).
  # NOTE(review): chromosome names and coordinates are presumably expected
  # to match `genome` - confirm against the caller.
  chrom <- dt[[chr_name]]
  pos <- dt[[pos_name]]
  flank_gr_5p <- GRanges(seqnames=chrom,ranges=IRanges(start=pos - n_flanking_bases_5p,end=pos - 1))
  flank_gr_3p <- GRanges(seqnames=chrom,ranges=IRanges(start=pos + 1,end=pos + n_flanking_bases_3p))
  # The locals deliberately do NOT share a name with the target columns:
  # inside dt[, ...] column names win over local variables, so on a table
  # that already carried seqCtx_5p/seqCtx_3p the old code re-assigned the
  # stale column to itself.
  flank_seq_5p <- as.vector(getSeq(genome,flank_gr_5p))
  flank_seq_3p <- as.vector(getSeq(genome,flank_gr_3p))
  dt[,seqCtx_5p:=flank_seq_5p]
  dt[,seqCtx_3p:=flank_seq_3p]
}
# For test,
# dt = out[[1]]

if (FALSE) {
  # ------------------------------------------------------------------------------
  # NOTE: 2022-04-28,
  # Setup
  # Joonoh Lim (joonoh.lim@kaist.ac.kr)
  # -----------------------------------
  # First created on 2022-04-24
  # Last modified on 2022-04-24,25,28
  # ------------------------------------------------------------------------------
  library(data.table)
  library(magrittr)
  library(gtools)
  library(pbmcapply)
  library(BSgenome.Hsapiens.Ensembl.GRCh37.jolim)
  library(ggplot2)
  library(ggbreak)
  # library(ggthemes)
  library(lemon)
  library(scales)
  library(flexmix) # flexmix
  # library(mixtools) # poisregmixEM
  # library(rebmix)
  # ------------------------------------------------------------------------------
  
  # Helper functions -------------------------------------------------------------
  get_imd <- function(a) {
    # Nearest-neighbour intermutation distance (IMD) for every element of `a`.
    #
    # Args:
    #   a: numeric/integer vector of positions. Only consecutive differences
    #      are taken, so the values are used in the order given.
    # Returns:
    #   A vector as long as `a`. Interior elements get the smaller of the
    #   gaps to the previous and to the next element; the first and last
    #   element use their single neighbour. A lone position yields NA;
    #   empty input yields an empty vector.
    n <- length(a)
    if (n < 2L) {
      # No neighbour to measure against.
      return(rep(a[NA_integer_], n))
    }
    gaps <- diff(a)
    gap_to_next <- c(gaps, gaps[n - 1L]) # last element borrows its left gap
    gap_to_prev <- c(gaps[1L], gaps)     # first element borrows its right gap
    # pmin() replaces the masked assignment `x[r < l] <- r[r < l]`, which can
    # stop with "NAs are not allowed in subscripted assignments" when `a`
    # contains NA. With pmin() the NA simply propagates.
    pmin(gap_to_next, gap_to_prev)
  }
  # For test,
  # a <- x[Chrom=='1',POS]
  
  
  assign_cluster <- function(pos,imd_cutoff) {
    # Splits a vector of positions into runs ("clusters") of neighbours.
    #
    # Args:
    #   pos:        numeric/integer positions, used in the order given.
    #   imd_cutoff: distance threshold; only the first element is used.
    # Returns:
    #   Character vector as long as `pos` holding the cluster number
    #   ("1", "2", ...) of each position. A new cluster starts at every
    #   position whose gap to the previous position is strictly greater
    #   than `imd_cutoff`. Empty input gives character(0).
    #
    # The cluster number is the running count of such breaks, so a single
    # cumsum() replaces the former per-cluster loop. Ids are built from
    # integers and are therefore always plain digits.
    if (length(pos) == 0L) {
      return(character(0))
    }
    imd_cutoff <- imd_cutoff[1]
    gap <- c(0, diff(pos)) # the first position has no predecessor
    starts_new_cluster <- (gap - imd_cutoff) > 0
    # An undefined gap (NA position or cutoff) never opens a new cluster,
    # matching the previous which()-based behaviour.
    starts_new_cluster[is.na(starts_new_cluster)] <- FALSE
    as.character(cumsum(starts_new_cluster) + 1L)
  }
  # For test,
  # pos <- dat[chr=='2',start]
  # imd_cutoff <- dat$imd_cutoff[1]
  # assign_cluster(pos,imd_cutoff)
  
  
  classify_substitution_type <- function(imd,vaf,vaf_diff_cutoff) {
    # Labels each mutation as 'sbs', 'dbs' or 'mbs' from runs of imd == 1.
    #
    # Args:
    #   imd:             nearest-neighbour distance of each mutation.
    #   vaf:             value per mutation, compared against mean(vaf).
    #   vaf_diff_cutoff: only the first element is used. Mutations whose
    #                    |mean(vaf) - vaf| exceeds it are treated as having
    #                    no adjacent neighbour (their imd is set to Inf).
    # Returns:
    #   Character vector as long as `imd`: a run of exactly two consecutive
    #   imd == 1 entries is 'dbs', a longer run is 'mbs', everything else
    #   'sbs'.
    vaf_diff_cutoff <- vaf_diff_cutoff[1]
    imd[abs(mean(vaf) - vaf) > vaf_diff_cutoff] <- Inf
    # An unknown distance (NA) counts as "not adjacent"; previously it left
    # the label as an empty string.
    is_adjacent <- !is.na(imd) & imd == 1
    run_lengths <- rle(is_adjacent)$lengths
    # Length of the run each element sits in, zeroed outside imd == 1 runs.
    adjacent_run <- rep(run_lengths, times = run_lengths) * is_adjacent
    subs_type <- rep('sbs', length(imd))
    subs_type[adjacent_run == 2] <- 'dbs'
    subs_type[adjacent_run > 2] <- 'mbs'
    subs_type
  }
  # For test,
  # vaf <- c(1,1,1,1,1,2,1,1,1,1,1,1,2,1,1,1)
  # imd <- c(1,1,2,1,1,1,1,2,2,1,1,2,2,2,1,1)
  # expected: dbs, dbs, sbs, dbs, dbs, sbs, sbs, sbs, sbs, dbs, dbs, sbs, sbs, sbs, dbs, dbs
  
  
  classify_cluster_type <- function(imd,subs_type,vaf,vaf_diff_cutoff,omikli_upto_n_muts=3) {
    # Labels the mutations of ONE cluster as 'other', 'dbs', 'mbs', 'omikli'
    # or 'kataegis'.
    #
    # Args:
    #   imd:                per-mutation distances; only its length is used.
    #   subs_type:          per-mutation 'sbs'/'dbs'/'mbs' labels.
    #   vaf:                per-mutation value compared against mean(vaf).
    #   vaf_diff_cutoff:    only the first element is used; mutations with
    #                       |mean(vaf) - vaf| above it are labelled 'other'
    #                       and excluded from the event count.
    #   omikli_upto_n_muts: largest event count still called 'omikli'
    #                       (must be >= 2).
    # Returns:
    #   Character vector as long as `imd`.
    #
    # Event count: every 'sbs' counts once, every run of consecutive 'dbs'
    # (or 'mbs') labels counts once.
    vaf_diff_cutoff <- vaf_diff_cutoff[1]

    # Sanity check
    if (omikli_upto_n_muts < 2) {
      stop("'omikli_upto_n_muts' should be greater than 1.")
    }

    # Other
    cluster_type <- vector('character',length(imd))
    idx_for_other <- abs((mean(vaf) - vaf)) > vaf_diff_cutoff
    cluster_type[idx_for_other] <- "other"
    subs_type[idx_for_other] <- "other"

    run_labels <- rle(subs_type)$values
    n_sbs <- sum(subs_type == 'sbs')
    n_dbs <- sum(run_labels == 'dbs')
    n_mbs <- sum(run_labels == 'mbs')
    n_clustered_muts <- n_sbs + n_dbs + n_mbs

    unassigned <- cluster_type == ''
    if (n_clustered_muts == 0) {
      # Nothing countable (normally: every mutation is 'other').
      # This branch used to be a stand-alone `if`, so execution fell through
      # to the 'omikli' branch below and mislabelled any leftover blanks.
    } else if (n_clustered_muts == 1) {
      if (n_dbs != 0) {
        # A single doublet
        cluster_type[unassigned] <- 'dbs'
      } else if (n_mbs != 0) {
        if (!any(unassigned)) {
          stop("Something went wrong with 'cluster_type'")
        }
        cluster_type[unassigned] <- 'mbs'
      } else {
        # One lone sbs next to 'other' mutations is not a cluster
        # (2022-05-01 patch).
        cluster_type[unassigned] <- 'other'
      }
    } else if (n_clustered_muts <= omikli_upto_n_muts) {
      cluster_type[unassigned] <- 'omikli'
    } else {
      cluster_type[unassigned] <- 'kataegis'
    }
    cluster_type
  }
  # For test,
  # 0:
  # imd <- c(0,0)
  # imd <- c(1,1,2,1,1,1,1,2,2,1,1,2,2,2,1,1)
  # 1:
  # imd <- dat[chr=='1' & cluster_id == '1',imd]
  # cluster_id <- dat[chr=='1' & cluster_id == '1',cluster_id]
  # vaf <- dat[chr=='1' & cluster_id == '1',VAF]
  # 2:
  # imd <- dat[chr=='2' & cluster_id == '3',imd]
  # cluster_id <- dat[chr=='2' & cluster_id == '3',cluster_id]
  # vaf <- dat[chr=='2' & cluster_id == '3',VAF]
  # 3:
  # x <- out_all_in_one[samples=='A3A_1st_C3_3ug-2' & chr == '22']
  # x <- dat[samples=='A3A_1st_C3' & chr == '1' & cluster_id == '8']
  # imd <- x$imd
  # vaf <- x$VAF
  # subs_type <- x$subs_type
  # vaf_diff_cutoff <- 0.1
  
  
  create_dir <- function(dir) {
    # Creates `dir` (including missing parents) unless it already exists and
    # prints what happened.
    #
    # Args:
    #   dir: path of the directory.
    # Returns:
    #   NULL, invisibly.
    if (dir.exists(dir)) {
      cat('A directory',dir,'already exists.\n')
      return(invisible(NULL))
    }
    # dir.create() reports failure through its return value (plus a
    # warning); the success message used to be printed regardless.
    if (dir.create(dir,recursive=TRUE)) {
      cat('A new directory',dir,'has been created.\n')
    } else {
      cat('Failed to create directory',dir,'\n')
    }
    invisible(NULL)
  }
  
  
  annotate_sequence_context <- function(dt,
                                        n_flanking_bases_5p=1,
                                        n_flanking_bases_3p=1,
                                        chr_name='chr',
                                        pos_name='start',
                                        genome=BSgenome.Hsapiens.Ensembl.GRCh37.jolim){
    # Adds the genome sequence flanking each position to `dt` BY REFERENCE
    # as columns `seqCtx_5p` and `seqCtx_3p`.
    #
    # Args:
    #   dt:                  data.table with a chromosome and a position column.
    #   n_flanking_bases_5p: bases fetched before the position
    #                        (range pos - n .. pos - 1).
    #   n_flanking_bases_3p: bases fetched after the position
    #                        (range pos + 1 .. pos + n).
    #   chr_name, pos_name:  names of the chromosome / position columns.
    #   genome:              object passed to getSeq().
    # Returns:
    #   `dt`, invisibly (the value of the last `:=`).
    chrom <- dt[[chr_name]]
    pos <- dt[[pos_name]]
    flank_gr_5p <- GRanges(seqnames=chrom,ranges=IRanges(start=pos - n_flanking_bases_5p,end=pos - 1))
    flank_gr_3p <- GRanges(seqnames=chrom,ranges=IRanges(start=pos + 1,end=pos + n_flanking_bases_3p))
    # The locals deliberately do NOT share a name with the target columns:
    # inside dt[, ...] column names win over local variables, so on a table
    # that already carried seqCtx_5p/seqCtx_3p the old code re-assigned the
    # stale column to itself.
    flank_seq_5p <- as.vector(getSeq(genome,flank_gr_5p))
    flank_seq_3p <- as.vector(getSeq(genome,flank_gr_3p))
    dt[,seqCtx_5p:=flank_seq_5p]
    dt[,seqCtx_3p:=flank_seq_3p]
  }
  # For test,
  # dt = out[[1]]
  
  if (FALSE) {
    # ------------------------------------------------------------------------------
    # NOTE: 2022-04-28,
    # Setup
    # Hard-coded project locations (alternative location kept for reference).
    main_dir <- '/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/discovery_simulation/add_simulation/unique_vaf/fin/02_results/v4'
    # main_dir <- '/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation'
    cluster_result_dir <- file.path(main_dir,'APOBEC_clustered_mutations')
    outdir <- file.path(cluster_result_dir,'annotated')

    # Input files: one VCF and one clustered-mutation table per sample, plus
    # the table of per-sample IMD cutoffs.
    input_vcf_filepaths <- list.files(
      file.path(main_dir,'input_vcfs_w_ccf_vaf'),
      full.names=TRUE
    )
    cluster_result_filepaths <- list.files(
      cluster_result_dir,
      pattern='_clustered.txt',
      full.names=TRUE
    )
    # imd_cutoffs <- fread('/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation/APOBEC_clustered_mutations/APOBEC_clustered_mutations_imd_cutoff.tsv')
    imd_cutoffs <- fread('/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/25_simulation/jolim_rerun/v2/imds/APOBEC_clustered_mutations_imd_cutoff.tsv')

    # One metadata row per sample: the cutoff table joined (inner merge on
    # `Sample`) with both file lists. Sample ids are the file names with
    # everything from '.mutect2' / '.APOBEC' onwards removed.
    metadata <- merge(
      imd_cutoffs,
      data.table(Sample=gsub("\\.mutect2.*","",basename(input_vcf_filepaths)),
                 input_vcf_filepaths=input_vcf_filepaths),
      by='Sample'
    )
    metadata <- merge(
      metadata,
      data.table(Sample=gsub("\\.APOBEC.*","",basename(cluster_result_filepaths)),
                 cluster_result_filepaths=cluster_result_filepaths),
      by='Sample'
    )
    metadata
    # VAF distribution
    # lapply(metadata$input_vcf_filepaths, function(f){
    #     # f <- metadata$input_vcf_filepaths[1]
    #     fread(f)$CCF %>% hist(breaks=1000)
    # })

    # Parameters: every value yields its own cluster_type_omikli_upto_<n> column
    omikli_upto_n_muts <- c(3,4,5)
    
    # vaf considered
    # --------------
    # Per sample: attach CCF/VAF from the input VCF to the clustered
    # mutations, annotate sequence context, IMD, cluster id, substitution
    # type and one cluster-type column per omikli cutoff.
    # seq_len() instead of 1:nrow() so an empty metadata table is a no-op.
    out <- pbmclapply(seq_len(nrow(metadata)), function(i){
      # NOTE: A3B_1st_C5_100ng_48h_SC-1 doesn't have CCF.
      # For debugging: i <- which(metadata$Sample == 'A3A_1st_C3_3ug-2')
      sample_id <- metadata[i,Sample]
      imd_cutoff <- metadata[i,IMD_cutoff]
      input_vcf <- fread(metadata[i,input_vcf_filepaths])
      input_vcf[,`#CHROM`:=as.character(`#CHROM`)]
      cluster_result <- fread(metadata[i,cluster_result_filepaths],fill=TRUE)
      cluster_result[,chr:=as.character(chr)]
      if (sample_id == 'A3B_1st_C5_100ng_48h_SC-1') {
        # Take the VAF field of INFO as CCF. The previous pattern captured a
        # single character and kept the rest of the string, so it only
        # worked when VAF was the last INFO field; this one extracts the
        # whole value wherever the field sits.
        input_vcf[,CCF:=as.numeric(sub('.*VAF=([^;|\t]+).*','\\1',INFO))]
      }
      cluster_result <- merge(cluster_result,input_vcf[,.(`#CHROM`,POS,CCF)],by.x=c('chr','start'),by.y=c('#CHROM','POS'))
      setnames(cluster_result,'CCF','VAF')
      dat <- cluster_result
      dat <- dat[order(start)][mixedorder(chr)]
      annotate_sequence_context(dat)
      dat[,imd:=get_imd(start),by=chr]
      dat[,imd_cutoff:=imd_cutoffs[Sample==sample_id,IMD_cutoff]]
      # Inside dat[...] `imd_cutoff` refers to the column set just above.
      dat[,cluster_id:=assign_cluster(start,imd_cutoff),by=chr] # Time consuming
      dat[,VAF:=as.numeric(VAF)]
      # Absolute VAF difference between consecutive mutations per chromosome.
      # NOTE (Yohan): in an APOBEC context, mutations probably belong to the
      # same event even if their VAFs differ somewhat numerically; the
      # difference is thought to come from depth fluctuation.
      # Example: two 'other' mutations 163 bp apart on chr9 of
      # A3A_1st_C3_3ug-3 with values 0.833 vs. 1.116 (DP 49 and 61).
      vaf_diffs <- dat[,abs(VAF - data.table::shift(VAF,1))[-1],by=chr]$V1 %>% sort
      # 90th percentile of those differences is the tolerance.
      # vaf_diff_cutoff <- mean(vaf_diffs)
      vaf_diff_cutoff <- vaf_diffs[floor(length(vaf_diffs)*0.90)]
      # vaf_diff_cutoff <- Inf # To ignore vaf
      dat[,subs_type:=classify_substitution_type(imd,VAF,vaf_diff_cutoff),by=c('chr','cluster_id')]
      for (omikli_cutoff in omikli_upto_n_muts) {
        dat[,cluster_type:=classify_cluster_type(imd,subs_type,VAF,vaf_diff_cutoff,omikli_upto_n_muts=omikli_cutoff),by=c('chr','cluster_id')]
        setnames(dat,'cluster_type',paste0('cluster_type_omikli_upto_',omikli_cutoff))
      }
      setnames(dat,'VAF','CCF_or_VAF')
      dat
    },mc.cores=10) %>% setNames(metadata$Sample)
    out_all_in_one <- out %>% do.call(rbind,.)

    # Save to files: one table per sample plus the combined table
    outdir_vaf_considered <- file.path(outdir,'vaf_considered')
    create_dir(outdir_vaf_considered)
    for (i in seq_along(out)) {
      outfile <- file.path(outdir_vaf_considered,paste0(names(out)[i],'.APOBEC_clustered.annotated.tsv'))
      fwrite(out[[i]],outfile,sep='\t')
    }
    all_outfile <- file.path(outdir_vaf_considered,'APOBEC_clustered_mutations.all_samples.annotated.tsv')
    fwrite(out_all_in_one,all_outfile,sep='\t')
    
    
    # vaf ignored
    # -----------
    # Same annotation as the 'vaf considered' run, but with an infinite VAF
    # tolerance, so no mutation is set aside because of its VAF.
    # seq_len() instead of 1:nrow() so an empty metadata table is a no-op.
    out_vaf_ignored <- pbmclapply(seq_len(nrow(metadata)), function(i){
      sample_id <- metadata[i,Sample]
      imd_cutoff <- metadata[i,IMD_cutoff]
      input_vcf <- fread(metadata[i,input_vcf_filepaths])
      input_vcf[,`#CHROM`:=as.character(`#CHROM`)]
      cluster_result <- fread(metadata[i,cluster_result_filepaths],fill=TRUE)
      cluster_result[,chr:=as.character(chr)]
      if (sample_id == 'A3B_1st_C5_100ng_48h_SC-1') {
        # Take the VAF field of INFO as CCF. The previous pattern captured a
        # single character and kept the rest of the string, so it only
        # worked when VAF was the last INFO field.
        input_vcf[,CCF:=as.numeric(sub('.*VAF=([^;|\t]+).*','\\1',INFO))]
      }
      cluster_result <- merge(cluster_result,input_vcf[,.(`#CHROM`,POS,CCF)],by.x=c('chr','start'),by.y=c('#CHROM','POS'))
      setnames(cluster_result,'CCF','VAF')
      dat <- cluster_result
      dat <- dat[order(start)][mixedorder(chr)]
      annotate_sequence_context(dat)
      dat[,imd:=get_imd(start),by=chr]
      dat[,imd_cutoff:=imd_cutoffs[Sample==sample_id,IMD_cutoff]]
      # Inside dat[...] `imd_cutoff` refers to the column set just above.
      dat[,cluster_id:=assign_cluster(start,imd_cutoff),by=chr] # Time consuming
      dat[,VAF:=as.numeric(VAF)]
      # (The consecutive-VAF differences computed in the other run are not
      # needed here and are no longer calculated.)
      vaf_diff_cutoff <- Inf # To ignore vaf
      dat[,subs_type:=classify_substitution_type(imd,VAF,vaf_diff_cutoff),by=c('chr','cluster_id')]
      for (omikli_cutoff in omikli_upto_n_muts) {
        dat[,cluster_type:=classify_cluster_type(imd,subs_type,VAF,vaf_diff_cutoff,omikli_upto_n_muts=omikli_cutoff),by=c('chr','cluster_id')]
        setnames(dat,'cluster_type',paste0('cluster_type_omikli_upto_',omikli_cutoff))
      }
      setnames(dat,'VAF','CCF_or_VAF')
      dat
    },mc.cores=10) %>% setNames(metadata$Sample)
    out_vaf_ignored_all_in_one <- out_vaf_ignored %>% do.call(rbind,.)

    # Save to files. The loop now iterates over (and takes the names from)
    # `out_vaf_ignored` itself rather than the result list of the other run.
    outdir_vaf_ignored <- file.path(outdir,'vaf_ignored')
    create_dir(outdir_vaf_ignored)
    for (i in seq_along(out_vaf_ignored)) {
      outfile <- file.path(outdir_vaf_ignored,paste0(names(out_vaf_ignored)[i],'.APOBEC_clustered.annotated.tsv'))
      fwrite(out_vaf_ignored[[i]],outfile,sep='\t')
    }
    all_outfile <- file.path(outdir_vaf_ignored,'APOBEC_clustered_mutations.all_samples.annotated.tsv')
    fwrite(out_vaf_ignored_all_in_one,all_outfile,sep='\t')
    
    
    
    # Plotting figures -------------------------------------------------------------
    # Reload the combined 'vaf_considered' table and keep the samples whose
    # Project_Code contains "BLCA".
    # all_outfile <- file.path(outdir,'APOBEC_clustered_mutations.all_samples.annotated.tsv')
    all_outfile <- file.path(outdir,'vaf_considered','APOBEC_clustered_mutations.all_samples.annotated.tsv')
    out_all_in_one <- fread(all_outfile)
    # readr/dplyr are not attached by this script, so their functions are
    # namespace-qualified (a bare filter() would resolve to stats::filter).
    pcawg_metadata<-readr::read_tsv("/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/target_sample.txt")
    fig_outdir <- '/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/timing_clonality/v2/vcf/separate/figures'
    # Interactive preview of the two join columns; no effect when sourced.
    pcawg_metadata%>%dplyr::select(Tumor_Sample_Barcode,Project_Code)
    blca_out<-dplyr::left_join(out_all_in_one,pcawg_metadata%>%dplyr::select(Tumor_Sample_Barcode,Project_Code),by=c("samples"="Tumor_Sample_Barcode"))%>%
      dplyr::filter(grepl("BLCA",Project_Code))
    # The data.table syntax below needs a data.table whatever the join returns.
    out_all_in_one<-as.data.table(blca_out)
    # The number of events vs. mutations per event
    # `omikli` / `kataegis`: number of mutations in each event of that type;
    # `*.f`: how many events have 2, 3, ... mutations.
    # NOTE: the three sections overwrite the same variables, so everything
    # downstream uses the LAST one (omikli up to 5).
    # omikli up to 3
    omikli <- out_all_in_one[cluster_type_omikli_upto_3 == 'omikli',.N,by=.(samples,chr,cluster_id)]$N
    omikli.f <- factor(omikli, levels=2:max(omikli), ordered=TRUE) %>% table
    kataegis <- out_all_in_one[cluster_type_omikli_upto_3 == 'kataegis',.N,by=.(samples,chr,cluster_id)]$N
    kataegis.f <- factor(kataegis, levels=2:max(kataegis), ordered=TRUE) %>% table

    # omikli up to 4
    omikli <- out_all_in_one[cluster_type_omikli_upto_4 == 'omikli',.N,by=.(samples,chr,cluster_id)]$N
    omikli.f <- factor(omikli, levels=2:max(omikli), ordered=TRUE) %>% table
    kataegis <- out_all_in_one[cluster_type_omikli_upto_4 == 'kataegis',.N,by=.(samples,chr,cluster_id)]$N
    kataegis.f <- factor(kataegis, levels=2:max(kataegis), ordered=TRUE) %>% table

    # omikli up to 5
    omikli <- out_all_in_one[cluster_type_omikli_upto_5 == 'omikli',.N,by=.(samples,chr,cluster_id)]$N
    omikli.f <- factor(omikli, levels=2:max(omikli), ordered=TRUE) %>% table
    kataegis <- out_all_in_one[cluster_type_omikli_upto_5 == 'kataegis',.N,by=.(samples,chr,cluster_id)]$N
    kataegis.f <- factor(kataegis, levels=2:max(kataegis), ordered=TRUE) %>% table
    
    # Poisson mixture fitting
    # Fits intercept-only Poisson mixtures with k = 1..5 components to the
    # per-event mutation counts (shifted by 2) and compares them by AIC.
    n_muts.df <- data.table(n_muts=c(omikli, kataegis) - 2) # given 2 mutations
    flexfit <- vector('list',5)
    lambdas <- vector('list',5)
    for (k in 1:5) {
      flexfit[[k]] <- flexmix(n_muts ~ 1, data=n_muts.df, k=k, model=FLXglm(family='poisson'))
      # exp() of the fitted parameters, stored as the component rates
      lambdas[[k]] <- exp(parameters(flexfit[[k]]))
    }
    AICs <- flexfit %>% sapply(function(x){summary(x)@AIC})
    # Relative likelihood of each model against the one with the lowest AIC
    rel_lik <- exp(0.5*(min(AICs) - AICs))
    
    # Relative likelihood vs. number of components
    p_relLik <- ggplot(data.table(n_comp=seq_along(rel_lik),rel_lik=rel_lik), aes(x=n_comp,y=rel_lik)) +
      geom_line() +
      geom_point() +
      theme_classic() +
      xlab('Number of components') +
      ylab('Relative likelihood')
    p_relLik
    ggsave(file.path(fig_outdir,'relative_likelihood_vs_number_of_components.png'),p_relLik,scale=0.5)
    
    # Poisson density vs. Number of mutations - 2
    # Uses the two rates of the k = 2 fit.
    lambda1 <- lambdas[[2]][1]
    lambda2 <- lambdas[[2]][2]
    # Poisson probability mass exp(-lambda) * lambda^k / k! for every integer
    # k from the first to the last element of `x` (interior elements of `x`
    # are not looked at).
    pois <- function(x,lambda) {
      o <- sapply(x[1]:x[length(x)],function(k){
        (exp(-lambda) * lambda^k)/factorial(k)
      })
      names(o) <- NULL
      o
    }
    comp1 <- pois(0:10,lambda1)
    comp2 <- pois(0:10,lambda2)
    cols <- hue_pal()(2)
    poisson_df <- data.table(x=0:10,`Component 1`=comp1,`Component 2`=comp2)
    # One line per component; vertical lines mark the two fitted rates.
    p_pois_d <- ggplot(poisson_df %>% melt(measure.vars=c('Component 1','Component 2')) %>% setnames('variable','Poisson mixture'), aes(x=x,y=value,fill=`Poisson mixture`,color=`Poisson mixture`)) +
      geom_line() +
      geom_point() +
      theme_classic() +
      scale_x_discrete(name ="Number of mutations - 2", limits=0:10) +
      ylab('Poisson density') +
      geom_vline(xintercept=lambda1,color=cols[1]) +
      geom_vline(xintercept=lambda2,color=cols[2]) +
      theme(legend.position = c(0.8, 0.5))
    poisson_df
    p_pois_d
    ggsave(file.path(fig_outdir,'poisson_density_vs_number_of_mutations_minus_2.png'),p_pois_d,scale=0.5)
    
    
    # Number of events vs. mutations per event
    # Stacked bars of observed event counts overlaid with the fitted curves.
    # The p-values in the trailing comments (and in the plot annotation
    # below) are typed in by hand, presumably from an earlier run - TODO
    # confirm after re-running.
    LR_test(flexfit[[1]],R=500) # BS=500, alternative='greater', k=1, p=0.002
    LR_test(flexfit[[2]],R=500) # BS=500, alternative='greater', k=2, p=0.2385
    # NOTE: the fitted results of k=2,3,4,5 are all the same.
    # Long table: one row per (cluster type, mutations per event)
    muts_n_events.dt <- data.table(cluster_type=c(rep('omikli',length(omikli.f)),rep('kataegis',length(kataegis.f))),
                                   muts_per_event=as.integer(c(names(omikli.f),names(kataegis.f))),
                                   n_events=c(omikli.f,kataegis.f))
    n_events <- muts_n_events.dt[,.(n_events=sum(n_events)),by=cluster_type]
    # Fitted curves: component density times the number of events of a type.
    # NOTE(review): pairs Component 1 with omikli and Component 2 with
    # kataegis - assumes the components come out in that order; verify.
    fitted_n_events <- data.table(muts_per_event=0:10 + 2,
                                  omikli=poisson_df$`Component 1` * n_events[cluster_type=='omikli',n_events],
                                  kataegis=poisson_df$`Component 2` * n_events[cluster_type=='kataegis',n_events])
    # Single-component fit scaled by the total number of events
    fitted_n_events_1_comp <- data.table(muts_per_event=0:10 + 2,
                                         n_events=pois(0:10,lambdas[[1]][1]) * n_events[,sum(n_events)])
    # Each coloured curve is drawn twice (thick black underneath, thin
    # coloured on top) to give it an outline.
    p_n_events <- ggplot(muts_n_events.dt) +
      theme_classic() +
      geom_bar(aes(x=muts_per_event,y=n_events,fill=forcats::fct_rev(cluster_type)), position='stack', stat='identity') +
      xlab('Mutations per event') +
      ylab('Number of events') +
      labs(fill='Cluster type') +
      # theme(legend.position = 'top') +
      # theme(legend.position = c(0.5,0.8)) +
      scale_y_break(c(200,1000),scale=0.5) +
      geom_line(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color='black', size=1.5) +
      geom_line(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color=cols[1], size=0.5) +
      geom_point(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color='black', size=2) +
      geom_point(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color=cols[1], size=1) +
      geom_line(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color='black', size=1.5) +
      geom_line(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color=cols[2], size=0.5) +
      geom_point(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color='black', size=2) +
      geom_point(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color=cols[2], size=1) +
      geom_line(data=fitted_n_events_1_comp, aes(x=muts_per_event,y=n_events), color='black', size=0.75) +
      geom_point(data=fitted_n_events_1_comp, aes(x=muts_per_event,y=n_events), color='black', size=0.75) +
      annotate("text",x=15, y=50, label="LR test\nk=1, p=0.002\nk=2, p=0.2385")
    # x_max <- muts_n_events.dt[n_events!=0,max(muts_per_event)]
    # p_n_events <- reposition_legend(p_n_events,x=0.5,y=0.8,just=0.5)#'top')
    p_n_events
    
    ggsave(file.path(fig_outdir,'number_of_events_vs_mutations_per_event.png'),p_n_events,scale=0.5)
    
    # Same layers as p_n_events, with the legend switched off by the final
    # theme() call; saved under a separate file name.
    p_n_events_wo_legend <- ggplot(muts_n_events.dt) +
      theme_classic() +
      geom_bar(aes(x=muts_per_event,y=n_events,fill=forcats::fct_rev(cluster_type)), position='stack', stat='identity') +
      xlab('Mutations per event') +
      ylab('Number of events') +
      labs(fill='Cluster type') +
      scale_y_break(c(200,1000),scale=0.5) +
      geom_line(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color='black', size=1.5) +
      geom_line(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color=cols[1], size=0.5) +
      geom_point(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color='black', size=2) +
      geom_point(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color=cols[1], size=1) +
      geom_line(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color='black', size=1.5) +
      geom_line(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color=cols[2], size=0.5) +
      geom_point(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color='black', size=2) +
      geom_point(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color=cols[2], size=1) +
      geom_line(data=fitted_n_events_1_comp, aes(x=muts_per_event,y=n_events), color='black', size=0.75) +
      geom_point(data=fitted_n_events_1_comp, aes(x=muts_per_event,y=n_events), color='black', size=0.75) +
      annotate("text",x=15, y=50, label="LR test\nk=1, p=0.002\nk=2, p=0.2385") +
      theme(legend.position = 'none')
    
    p_n_events_wo_legend
    ggsave(file.path(fig_outdir,'number_of_events_vs_mutations_per_event.wo_legend.png'),p_n_events_wo_legend,scale=0.5)
    
    
    # omikli 3, 4, vs. 5
    # Log-likelihood and degrees of freedom of the k = 2 fit, inspected by
    # hand; the values in the trailing comments were recorded manually.
    # 3:
    flexfit[[2]]@logLik # -1246.347
    flexfit[[2]]@df # 3
    # 4: -1246.347
    # 5: -1246.348
    
    
  }
  
  
  ################################################################################
  # FIGURE: Clustered mutations :: rainfall plot (2022-08-03)
  # ------------------------------------------------------------------------------
  library(grid)
  library(karyoploteR)
  library(ggplotify)

  # Hard-coded locations of the clustering results and of the figure output.
  cluster_result_dir <- '/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation/APOBEC_clustered_mutations'
  figure_dir <- '/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/figures/clustered_mutation'

  # Inputs: annotated clustered mutations (the 'vaf_ignored' run),
  # non-clustered mutations and the per-sample IMD cutoffs.
  # APOBEC_cl_anno_vaf_considered_filelist <- list.files(file.path(cluster_result_dir,'annotated/vaf_considered'),full.names=TRUE)
  APOBEC_cl_anno_vaf_ignored_filelist <- list.files(
    file.path(cluster_result_dir,'annotated/vaf_ignored'),
    full.names=TRUE
  )
  APOBEC_non_cl_filelist <- list.files(
    cluster_result_dir,
    pattern='APOBEC_nonClustered.txt',
    full.names=TRUE
  )
  APOBEC_cutoff_values <- fread(file.path(cluster_result_dir,'APOBEC_clustered_mutations_imd_cutoff.tsv'))

  # Sample ids are the file names without their suffix.
  # cl <- APOBEC_cl_anno_vaf_considered_filelist %>% basename %>% gsub('.APOBEC_clustered.annotated.tsv','',.)
  cl <- gsub('.APOBEC_clustered.annotated.tsv','',basename(APOBEC_cl_anno_vaf_ignored_filelist))
  ncl <- gsub('.APOBEC_nonClustered.txt','',basename(APOBEC_non_cl_filelist))

  # One row per non-clustered file, paired with the clustered file of the
  # same sample id (NA when there is none).
  APOBEC_cl_mut_filelist <- data.table(
    ID=ncl,
    # APOBEC_cl=APOBEC_cl_anno_vaf_considered_filelist[match(ncl,cl)],
    APOBEC_cl=APOBEC_cl_anno_vaf_ignored_filelist[match(ncl,cl)],
    APOBEC_ncl=APOBEC_non_cl_filelist
  )
  # Put the cutoff rows in the same order as the file table.
  APOBEC_cutoff_values <- APOBEC_cutoff_values[match(APOBEC_cl_mut_filelist$ID,APOBEC_cutoff_values$Sample),]

  create_dir(figure_dir)
  
  # Draw one rainfall plot per sample and save it as
  # <sample_id>.rainfall_plot.pdf in figure_dir. Clustered and non-clustered
  # substitutions are pooled, purine-reference changes are complemented so that
  # every change is written with a C or T reference, points are colored by
  # substitution class, and the sample's IMD cutoff is drawn as a red line.
  for (i in seq_len(nrow(APOBEC_cl_mut_filelist))) { # seq_len(): no iteration when the table is empty
    
    # f <- APOBEC_cl_anno_vaf_considered_filelist[1]
    # f <- APOBEC_cl_anno_vaf_ignored_filelist[1]
    
    # clustered mutations
    sample_id <- APOBEC_cl_mut_filelist[i,ID]
    x <- fread(APOBEC_cl_mut_filelist[i,APOBEC_cl])
    x$chr <- x$chr %>% gsub('MT','M',.) %>% paste0('chr',.)
    gr_cl <- GRanges(seqnames=x$chr,ranges=IRanges(x$start,x$end))
    mcols(gr_cl) <- data.frame(mut.type='subs',ref=x$ref,alt=x$alt)
    
    # non-clustered mutations
    y <- fread(APOBEC_cl_mut_filelist[i,APOBEC_ncl],fill=TRUE)
    y$chr <- y$chr %>% gsub('MT','M',.) %>% paste0('chr',.)
    gr_ncl <- GRanges(seqnames=y$chr,ranges=IRanges(y$start,y$end))
    mcols(gr_ncl) <- data.frame(mut.type='subs',ref=y$ref,alt=y$alt)
    
    # merge
    gr <- c(gr_cl, gr_ncl) %>% sort
    
    # Reverse complement: rows whose reference base is A or G get both ref and
    # alt complemented.
    # idx_rc <- gr$alt == 'A' | gr$alt == 'G'
    idx_rc <- gr$ref == 'A' | gr$ref == 'G'
    ref_rc <- gr[idx_rc]$ref %>% chartr('ACGT','TGCA',.)
    alt_rc <- gr[idx_rc]$alt %>% chartr('ACGT','TGCA',.)
    gr[idx_rc]$ref <- ref_rc
    gr[idx_rc]$alt <- alt_rc
    
    # plot
    pp <- getDefaultPlotParams(plot.type = 4)
    pp$data1inmargin <- 0
    pp$bottommargin <- 20
    pp$leftmargin <- 0.1
    pp$rightmargin <- 0.1
    
    # Plot color
    # ----------
    # Color preset
    # variant.colors <- getVariantsColors(gr$ref, gr$alt)
    # variant.colors["C>T"] <- "#e40611" # red
    # variant.colors["T>C"] <- "#fbe800" # yellow
    
    # Custom colors: one entry per mutation (computed after sorting `gr`),
    # grey for anything that is not one of the six listed classes.
    chgs <- paste0(gr$ref,'>',gr$alt)
    variant.colors <- rep("grey",length(chgs)) %>% setNames(chgs)
    # variant.colors[chgs == 'A>G'] <- '#A1CF64'
    variant.colors[chgs == 'C>A'] <- '#1EBFF0'
    variant.colors[chgs == 'C>T'] <- '#E62725'
    variant.colors[chgs == 'C>G'] <- '#050708'
    variant.colors[chgs == 'T>A'] <- '#CBCACB'
    variant.colors[chgs == 'T>C'] <- '#A1CF64'
    variant.colors[chgs == 'T>G'] <- '#EDC8C5'
    
    # for cutoff line
    # kpAxis_ymax <- log10(max(end(gr[-1]) - start(gr[-length(gr)])))
    kpAxis_ymax <- 8
    # The log10 cutoff is expressed as a fraction of the axis maximum;
    # kpSegments() below is called without ymin/ymax, so this presumably lands
    # on its default 0-1 scale -- TODO confirm.
    cutoff_line_y <- log10(APOBEC_cutoff_values$IMD_cutoff[i]) / kpAxis_ymax
    
    outfilename <- paste0(sample_id,'.rainfall_plot.pdf')
    pdf(file.path(figure_dir,outfilename),width=11)
    # Close the PDF device even when a plotting call fails, so that an error in
    # one sample does not leave a graphics device open.
    tryCatch({
      # x <- as.grob(expression({
      kp <- plotKaryotype(plot.type=4, ideogram.plotter = NULL,
                          labels.plotter = NULL, plot.params = pp)
      
      # Remove chrY
      kp$chromosomes <- kp$chromosomes %>% setdiff('chrY')
      kp$chromosome.lengths <- kp$chromosome.lengths[kp$chromosomes]
      kp$cytobands <- kp$cytobands[seqnames(kp$cytobands) %in% kp$chromosomes]
      kp$genome <- kp$genome[seqnames(kp$genome) %in% kp$chromosomes]
      kp$plot.region <- kp$plot.region[seqnames(kp$plot.region) %in% kp$chromosomes]
      
      kpAddCytobandsAsLine(kp, lwd=7)
      kpAddChromosomeNames(kp, srt=45)
      kpAddMainTitle(kp, main=sample_id, cex=1.2)
      # Axis and points share kpAxis_ymax so that they cannot drift apart.
      kpAxis(kp, ymax = kpAxis_ymax, tick.pos = 0:kpAxis_ymax)
      kpPlotRainfall(kp,
                     data = gr,
                     col=variant.colors,
                     ymin=0,
                     ymax=kpAxis_ymax)
      # kpPlotRainfall(kp, data = gr)
      kpAddLabels(kp, labels = c("Distance between mutations (log10)"), srt=90, pos=1, label.margin = 0.07)
      # Full element name instead of relying on `$` partial matching.
      kpSegments(kp,
                 chr=kp$chromosomes,
                 x0=0,
                 x1=kp$chromosome.lengths+1E7,
                 y0=cutoff_line_y,
                 y1=cutoff_line_y,
                 col='red',
                 clipping=FALSE)
      par(xpd=TRUE)
      legend(x = "bottomright",
             fill = c('#1EBFF0', '#050708', '#E62725', '#CBCACB', '#A1CF64', '#EDC8C5'),
             legend = c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"),
             inset=c(-0.01,0.075))
      
      # kpText(kp, chr="chr1", x=x, y=y, labels='cutoff', col='red')
      # }))
      # grid.newpage()
      # grid.draw(x)
    }, finally = dev.off())
    
  }
  # Source:
  # [1] https://bernatgel.github.io/karyoploter_tutorial//Examples/Rainfall/Rainfall.html
  
  
  
  
  
  
  
  
  
  
  # Test bed (2022-04-28) --------------------------------------------------------
  ## flexmix
  #
  # Not used ---------------------------------------------------------------------
  # mixtools
  ## EM output for data generated from a 2-component model.
  # set.seed(100)
  # beta <- matrix(c(1, .5, .7, -.8), 2, 2)
  # x <- runif(50, 0, 10)
  # xbeta <- cbind(1, x)%*%beta
  # w <- rbinom(50, 1, .5)
  # y <- w*rpois(50, exp(xbeta[, 1]))+(1-w)*rpois(50, exp(xbeta[, 2]))
  # out <- poisregmixEM(y, x, verb = TRUE, epsilon = 1e-03)
  # out
  #
  # ------------------------------------------------------------------------------
  # A script for finding an IMD for each sample
  # ------------------------------------------------------------------------------
  # NOTE: The following has been replaced by SigProfilerCluster.
  # ------------------------------------------------------------------------------
  # obsFile_path <- "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter/A3A_1st_C3_100ng-1.mutect2_strelka2_union.snvs.vcf.seqzcn.scF.bino1-4P.prob.clonal.vcf"
  # simDir_path <- "/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation/00_clustered_mutation/A3A_1st_C3/output/simulations/APOBEC_simulations_GRCh37_96/"
  # simFile_paths <- list.files(simDir_path,full=TRUE)
  # x <- fread(obsFile_path)
  # y <- fread(simFile_path)
  # sims <- lapply(1:length(simFile_paths), function(i){
  #     y <- fread(simFile_paths[i])
  #     y <- y[order(Start_position)][mixedorder(Chrom)][,.(Chrom,POS=Start_position)]
  #     y[,run:=i]
  # }) %>% do.call(rbind,.)
  # sims
  
  # # Sort and subset x
  # x <- x[order(POS)][mixedorder(`#CHROM`)][,.(Chrom=`#CHROM`,POS)]
  # x_imd <- x[,.(imd=get_imd(POS)),by=Chrom]
  
  # IMD <- 1000
  # # sum(x_imd$imd >= IMD)
  # sum(x_imd$imd < IMD)
  # sims[run==3,.(imd=get_imd(POS)),by=Chrom][,sum(imd<IMD)]
  
  # # Find the IMD that makes 90% of the simulation runs produce at least one cluster.
  # IMD <- 1000 # initial value
  # increment <- 100
  # alpha <- 0.5
  # increment_direction <- 0
  # fdr <- 0.1
  # while (TRUE) {
  #     cat('IMD:',IMD)
  #     sims_imd <- sims[,.(imd=get_imd(POS)),by=.(run,Chrom)]
  #     o <- sum(sims_imd[,sum(imd < IMD) >= 1,by=run]$V1)/100
  #     cat(' -> FDR',1-o,'\n')
  #     if (o == (1 - fdr)) {
  #         break
  #     }
  #     if (o < (1 - fdr)) {
  #         if (increment_direction %in% c(-1,0)) {
  #             IMD <- IMD + increment
  #         } else if (increment_direction == 1) {
  #             IMD <- IMD - increment
  #             increment <- increment * alpha
  #             increment_direction <- 0
  #         }
  #     } else if (o > (1 - fdr)) {
  #         if (increment_direction %in% c(1,0)) {
  #             IMD <- IMD - increment
  #         } else if (increment_direction == -1) {
  #             IMD <- IMD + increment
  #             increment <- increment * alpha
  #             increment_direction <- 0
  #         }
  #     }
  #     # Sys.sleep(0.01)
  # }
  
  
  
  
  # Input locations and per-sample metadata -------------------------------------
  main_dir <- '/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/timing_clonality/v2/vcf/separate'
  #main_dir <- '/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation'
  outdir <- file.path(main_dir,'APOBEC_clustered_mutations/annotated')
  cluster_result_dir <- file.path(main_dir,'APOBEC_clustered_mutations')
  
  # Load data
  # (`full.names` is spelled out rather than relying on partial argument matching.)
  input_vcf_filepaths <- list.files(file.path(main_dir,'00_clustered_mutation/input_vcfs_w_ccf_vaf'),full.names=TRUE)
  cluster_result_filepaths <- list.files(cluster_result_dir, pattern='_clustered.txt', full.names=TRUE)
  #imd_cutoffs <- fread('/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation/APOBEC_clustered_mutations/APOBEC_clustered_mutations_imd_cutoff.tsv')
  imd_cutoffs <- fread('/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/timing_clonality/v2/vcf/separate/imds/APOBEC_clustered_mutations_imd_cutoff.tsv')
  
  # Setup
  # The sample ID is the part of the file name before its first '.'.
  sample_id_from_path <- function(paths) {
    paths %>% basename %>% gsub('([^.]+).*','\\1',.)
  }
  # Start from the cutoff table and attach, by sample, the input VCF path and
  # the cluster result path. merge() keeps only samples present on both sides.
  metadata <- imd_cutoffs
  metadata <- data.table(Sample=sample_id_from_path(input_vcf_filepaths),
                         input_vcf_filepaths=input_vcf_filepaths) %>%
    merge(metadata,.,by='Sample')
  metadata <- data.table(Sample=sample_id_from_path(cluster_result_filepaths),
                         cluster_result_filepaths=cluster_result_filepaths) %>%
    merge(metadata,.,by='Sample')
  
  # VAF distribution
  # lapply(metadata$input_vcf_filepaths, function(f){
  #     # f <- metadata$input_vcf_filepaths[1]
  #     fread(f)$CCF %>% hist(breaks=1000)
  # })
  
  # Parameters
  # Upper bounds passed as `omikli_upto_n_muts` to classify_cluster_type(); one
  # cluster_type_omikli_upto_<n> column is produced per value.
  omikli_upto_n_muts <- c(3,4,5)
  
  # vaf considered
  # --------------
  # Per sample (in parallel): attach the VAF from the input VCF to every cluster
  # call, order by chromosome and position, annotate the sequence context,
  # compute intermutation distances, assign cluster IDs and classify each
  # substitution and cluster. Returns a list of data.tables named by sample.
  out <- pbmclapply(seq_len(nrow(metadata)), function(i){
    # cat('Processing:',i,'/',nrow(metadata),'\n')#,'\r')
    # NOTE: A3B_1st_C5_100ng_48h_SC-1 doesn't have CCF.
    # For debugging, e.g.:
    # i <- 6 # which(metadata$Sample == 'A3A_1st_C3_3ug-2')
    # i <- which(metadata$Sample=='A3B_1st_C5_100ng_48h_SC-1') # an exception case
    sample_id <- metadata[i,Sample]
    input_vcf <- fread(metadata[i,input_vcf_filepaths])
    input_vcf[,`#CHROM`:=as.character(`#CHROM`)]
    cluster_result <- fread(metadata[i,cluster_result_filepaths],fill=TRUE)
    cluster_result[,chr:=as.character(chr)]
    if (sample_id == 'A3B_1st_C5_100ng_48h_SC-1') {
      # Take the value after "VAF=" in INFO. The capture group now takes the
      # whole value and the rest of INFO is dropped; the previous pattern
      # captured a single character and left any following fields in place,
      # which only parsed when VAF was the last INFO field.
      # NOTE(review): this fills CCF, but the merge below reads the VAF
      # column -- confirm which column this sample is meant to provide.
      input_vcf[,CCF:=as.numeric(sub('.*VAF=([^;|^\t]+).*','\\1',INFO))]
    }
    # Inner join: cluster calls without a matching VCF record are dropped.
    cluster_result <- merge(cluster_result,input_vcf[,.(`#CHROM`,POS,VAF)],by.x=c('chr','start'),by.y=c('#CHROM','POS'))
    #setnames(cluster_result,'CCF','VAF')
    dat <- cluster_result
    dat <- dat[order(start)][mixedorder(chr)]
    annotate_sequence_context(dat)
    dat[,imd:=get_imd(start),by=chr]
    # The cutoff is stored as a column so that the grouped call below can pass
    # it to assign_cluster().
    dat[,imd_cutoff:=imd_cutoffs[Sample==sample_id,IMD_cutoff]]
    dat[,cluster_id:=assign_cluster(start,imd_cutoff),by=chr] # Time consuming
    dat[,VAF:=as.numeric(VAF)]
    # Sorted absolute VAF differences between consecutive mutations on a chromosome.
    # NOTE: In an APOBEC context, mutations are probably from the same event even
    # if their VAFs differ somewhat numerically; the difference is thought to be
    # caused by fluctuation in depth. - Yohan
    vaf_diffs <- dat[,abs(VAF - data.table::shift(VAF,1))[-1],by=chr]$V1 %>% sort
    # vaf_diff_cutoff <- mean(vaf_diffs)
    # NOTE(review): with fewer than two differences the index below is 0 and the
    # cutoff is empty -- confirm how the classify_* helpers handle that.
    vaf_diff_cutoff <- vaf_diffs[floor(length(vaf_diffs)*0.90)]
    # vaf_diff_cutoff <- Inf # To ignore vaf
    # dat[,vaf_diff_cutoff:=vaf_diff_cutoff]
    # Ex.)
    # 9       128508903       APOBEC  A3A_1st_C3_3ug-3        .       GRCh37  SNP     128508903       G       C       SOMATIC 2466126 163             0.833333333333  163     7218    7       other   other   other
    # 9       128509066       APOBEC  A3A_1st_C3_3ug-3        .       GRCh37  SNP     128509066       G       C       SOMATIC 163     163             1.11627906977   163     7218    7       other   other   other
    #
    # 0.833333333333 vs. 1.11627906977 -> DP is 49 and 61, respectively
    #
    dat[,subs_type:=classify_substitution_type(imd,VAF,vaf_diff_cutoff),by=c('chr','cluster_id')]
    # One cluster_type_omikli_upto_<n> column per omikli size bound.
    for (omikli_cutoff in omikli_upto_n_muts) {
      dat[,cluster_type:=classify_cluster_type(imd,subs_type,VAF,vaf_diff_cutoff,omikli_upto_n_muts=omikli_cutoff),by=c('chr','cluster_id')]
      setnames(dat,'cluster_type',paste0('cluster_type_omikli_upto_',omikli_cutoff))
    }
    setnames(dat,'VAF','CCF_or_VAF')
    dat
  },mc.cores=10) %>% setNames(metadata$Sample)
  out_all_in_one <- out %>% do.call(rbind,.)
  
  # Save to files: one table per sample plus one pooled table.
  outdir_vaf_considered <- file.path(outdir,'vaf_considered')
  create_dir(outdir_vaf_considered)
  for (i in seq_along(out)) {
    outfile <- file.path(outdir_vaf_considered,paste0(names(out)[i],'.APOBEC_clustered.annotated.tsv'))
    fwrite(out[[i]],outfile,sep='\t')
  }
  all_outfile <- file.path(outdir_vaf_considered,'APOBEC_clustered_mutations.all_samples.annotated.tsv')
  fwrite(out_all_in_one,all_outfile,sep='\t')
  
  
  # vaf ignored
  # -----------
  # Same per-sample annotation as the VAF-considered run, except that the CCF
  # column of the input VCF is used and the VAF-difference cutoff is Inf.
  # Returns a list of data.tables named by sample.
  out_vaf_ignored <- pbmclapply(seq_len(nrow(metadata)), function(i){
    sample_id <- metadata[i,Sample]
    input_vcf <- fread(metadata[i,input_vcf_filepaths])
    input_vcf[,`#CHROM`:=as.character(`#CHROM`)]
    cluster_result <- fread(metadata[i,cluster_result_filepaths],fill=TRUE)
    cluster_result[,chr:=as.character(chr)]
    if (sample_id == 'A3B_1st_C5_100ng_48h_SC-1') {
      # Fill CCF for this sample from the value after "VAF=" in INFO. The
      # capture group now takes the whole value and the rest of INFO is
      # dropped; the previous pattern captured a single character and left any
      # following fields in place, which only parsed when VAF was the last
      # INFO field.
      input_vcf[,CCF:=as.numeric(sub('.*VAF=([^;|^\t]+).*','\\1',INFO))]
    }
    # Inner join: cluster calls without a matching VCF record are dropped.
    cluster_result <- merge(cluster_result,input_vcf[,.(`#CHROM`,POS,CCF)],by.x=c('chr','start'),by.y=c('#CHROM','POS'))
    setnames(cluster_result,'CCF','VAF')
    dat <- cluster_result
    dat <- dat[order(start)][mixedorder(chr)]
    annotate_sequence_context(dat)
    dat[,imd:=get_imd(start),by=chr]
    # The cutoff is stored as a column so that the grouped call below can pass
    # it to assign_cluster().
    dat[,imd_cutoff:=imd_cutoffs[Sample==sample_id,IMD_cutoff]]
    dat[,cluster_id:=assign_cluster(start,imd_cutoff),by=chr] # Time consuming
    dat[,VAF:=as.numeric(VAF)]
    # NOTE: In an APOBEC context, mutations are probably from the same event even
    # if their VAFs differ somewhat numerically; the difference is thought to be
    # caused by fluctuation in depth. - Yohan
    # vaf_diffs <- dat[,abs(VAF - data.table::shift(VAF,1))[-1],by=chr]$V1 %>% sort
    # vaf_diff_cutoff <- vaf_diffs[floor(length(vaf_diffs)*0.90)]
    vaf_diff_cutoff <- Inf # To ignore vaf
    dat[,subs_type:=classify_substitution_type(imd,VAF,vaf_diff_cutoff),by=c('chr','cluster_id')]
    # One cluster_type_omikli_upto_<n> column per omikli size bound.
    for (omikli_cutoff in omikli_upto_n_muts) {
      dat[,cluster_type:=classify_cluster_type(imd,subs_type,VAF,vaf_diff_cutoff,omikli_upto_n_muts=omikli_cutoff),by=c('chr','cluster_id')]
      setnames(dat,'cluster_type',paste0('cluster_type_omikli_upto_',omikli_cutoff))
    }
    setnames(dat,'VAF','CCF_or_VAF')
    dat
  },mc.cores=10) %>% setNames(metadata$Sample)
  out_vaf_ignored_all_in_one <- out_vaf_ignored %>% do.call(rbind,.)
  
  # Save to files: one table per sample plus one pooled table.
  # The loop takes its length and file names from out_vaf_ignored itself rather
  # than from the VAF-considered result `out`.
  outdir_vaf_ignored <- file.path(outdir,'vaf_ignored')
  create_dir(outdir_vaf_ignored)
  for (i in seq_along(out_vaf_ignored)) {
    outfile <- file.path(outdir_vaf_ignored,paste0(names(out_vaf_ignored)[i],'.APOBEC_clustered.annotated.tsv'))
    fwrite(out_vaf_ignored[[i]],outfile,sep='\t')
  }
  all_outfile <- file.path(outdir_vaf_ignored,'APOBEC_clustered_mutations.all_samples.annotated.tsv')
  fwrite(out_vaf_ignored_all_in_one,all_outfile,sep='\t')
  
  
  
  # Plotting figures -------------------------------------------------------------
  # Reload the pooled VAF-considered annotation and keep only the rows whose
  # sample has a Project_Code containing "BLCA" in the PCAWG sample sheet.
  # all_outfile <- file.path(outdir,'APOBEC_clustered_mutations.all_samples.annotated.tsv')
  all_outfile <- file.path(outdir,'vaf_considered','APOBEC_clustered_mutations.all_samples.annotated.tsv')
  out_all_in_one <- fread(all_outfile)
  pcawg_metadata <- read_tsv("/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/target_sample.txt")
  fig_outdir <- '/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/timing_clonality/v2/vcf/separate/figures'
  # Make sure the figure directory exists before the ggsave() calls below, as
  # is done for the other output directories in this script.
  create_dir(fig_outdir)
  # Only the barcode (join key) and the project code are needed from the sheet.
  sample_projects <- pcawg_metadata %>% dplyr::select(Tumor_Sample_Barcode,Project_Code)
  blca_out <- left_join(out_all_in_one,sample_projects,by=c("samples"="Tumor_Sample_Barcode")) %>%
    filter(grepl("BLCA",Project_Code))
  out_all_in_one <- blca_out
  # The number of events vs. mutations per event
  # --------------------------------------------
  # Sizes (number of mutations) of the events carrying label `wanted_type` in
  # column `type_column_name`. An event is one (samples, chr, cluster_id) group.
  count_muts_per_event <- function(dt, type_column_name, wanted_type) {
    stopifnot(type_column_name %in% names(dt))
    is_wanted <- dt[[type_column_name]] == wanted_type
    dt[is_wanted, .N, by=.(samples,chr,cluster_id)]$N
  }
  # Frequency table of event sizes over every size from 2 up to the largest
  # observed one, so that sizes with no event appear with a count of 0.
  tabulate_event_sizes <- function(sizes) {
    factor(sizes, levels=2:max(sizes), ordered=TRUE) %>% table
  }
  
  # Event sizes under each omikli definition (up to 3, 4 or 5 mutations).
  omikli_cutoffs <- c(3,4,5)
  event_sizes_by_cutoff <- lapply(omikli_cutoffs, function(n_max) {
    type_column_name <- paste0('cluster_type_omikli_upto_',n_max)
    list(omikli=count_muts_per_event(out_all_in_one,type_column_name,'omikli'),
         kataegis=count_muts_per_event(out_all_in_one,type_column_name,'kataegis'))
  }) %>% setNames(omikli_cutoffs)
  
  # The mixture fit and plots below use one definition only. The previous
  # version recomputed the same four variables for 3, 4 and 5 in turn, so the
  # "up to 5" result was the one left in effect; change the value here to
  # compare definitions.
  omikli_cutoff_in_use <- 5
  sizes_in_use <- event_sizes_by_cutoff[[as.character(omikli_cutoff_in_use)]]
  omikli <- sizes_in_use$omikli
  omikli.f <- tabulate_event_sizes(omikli)
  kataegis <- sizes_in_use$kataegis
  kataegis.f <- tabulate_event_sizes(kataegis)
  
  # Poisson mixture fitting
  # -----------------------
  # Fit intercept-only Poisson mixtures with 1..max_components components to
  # the per-event mutation counts, then compare them by AIC.
  n_muts.df <- data.table(n_muts=c(omikli, kataegis) - 2) # given 2 mutations
  max_components <- 5
  flexfit <- vector('list',max_components)
  lambdas <- vector('list',max_components)
  for (k in seq_len(max_components)) {
    flexfit[[k]] <- flexmix(n_muts ~ 1, data=n_muts.df, k=k, model=FLXglm(family='poisson'))
    # exp() maps the fitted coefficients back to Poisson means.
    lambdas[[k]] <- exp(parameters(flexfit[[k]]))
  }
  # vapply() guarantees one numeric AIC per fit.
  AICs <- vapply(flexfit, function(fit) summary(fit)@AIC, numeric(1))
  # Relative likelihood of each model with respect to the lowest-AIC model.
  rel_lik <- exp(0.5*(min(AICs) - AICs))
  
  # Relative likelihood vs. number of components
  p_relLik <- ggplot(data.table(n_comp=seq_along(rel_lik),rel_lik=rel_lik), aes(x=n_comp,y=rel_lik)) +
    geom_line() +
    geom_point() +
    theme_classic() +
    xlab('Number of components') +
    ylab('Relative likelihood')
  p_relLik
  ggsave(file.path(fig_outdir,'relative_likelihood_vs_number_of_components.png'),p_relLik,scale=0.5)
  
  # Poisson density vs. Number of mutations - 2
  # Means of the two-component fit.
  # NOTE(review): if the k=2 fit ends up with fewer than two components,
  # lambda2 is NA -- confirm this cannot happen for the data at hand.
  lambda1 <- lambdas[[2]][1]
  lambda2 <- lambdas[[2]][2]
  # Poisson probability mass function evaluated on a run of consecutive counts.
  #
  # x:      numeric vector; only its first and last elements are used, and the
  #         PMF is evaluated at every step of x[1]:x[length(x)] (integer counts
  #         are expected).
  # lambda: Poisson mean (a single value).
  # Returns an unnamed numeric vector with one probability per evaluated count.
  #
  # stats::dpois() replaces the hand-written exp(-lambda) * lambda^k / k!, which
  # degrades to 0 or NaN once factorial(k) or lambda^k overflows (k > 170).
  pois <- function(x,lambda) {
    counts <- x[1]:x[length(x)]
    unname(stats::dpois(counts,lambda))
  }
  # Evaluate the two fitted Poisson components on 0..10 and plot both curves,
  # with a vertical line at each component's mean.
  support <- 0:10
  comp1 <- pois(support,lambda1)
  comp2 <- pois(support,lambda2)
  cols <- hue_pal()(2)
  poisson_df <- data.table(x=support,`Component 1`=comp1,`Component 2`=comp2)
  
  # Long format: one row per (x, component), as needed for the color mapping.
  poisson_long <- melt(poisson_df, measure.vars=c('Component 1','Component 2'))
  setnames(poisson_long,'variable','Poisson mixture')
  
  p_pois_d <- ggplot(poisson_long,
                     aes(x=x,y=value,fill=`Poisson mixture`,color=`Poisson mixture`)) +
    geom_line() +
    geom_point() +
    theme_classic() +
    scale_x_discrete(name ="Number of mutations - 2", limits=0:10) +
    ylab('Poisson density') +
    geom_vline(xintercept=lambda1,color=cols[1]) +
    geom_vline(xintercept=lambda2,color=cols[2]) +
    theme(legend.position = c(0.8, 0.5))
  poisson_df
  p_pois_d
  
  density_png <- file.path(fig_outdir,'poisson_density_vs_number_of_mutations_minus_2.png')
  ggsave(density_png,p_pois_d,scale=0.5)
  
  
  # Number of events vs. mutations per event
  # Stacked bars of the observed event counts per event size, overlaid with the
  # expected counts under the two-component and the one-component Poisson fits.
  LR_test(flexfit[[1]],R=500) # BS=500, alternative='greater', k=1, p=0.002
  LR_test(flexfit[[2]],R=500) # BS=500, alternative='greater', k=2, p=0.2385
  # NOTE: the fitted results of k=2,3,4,5 are all the same.
  
  # Observed counts: one row per (cluster type, event size).
  type_labels <- rep(c('omikli','kataegis'), times=c(length(omikli.f),length(kataegis.f)))
  muts_n_events.dt <- data.table(cluster_type=type_labels,
                                 muts_per_event=as.integer(c(names(omikli.f),names(kataegis.f))),
                                 n_events=c(omikli.f,kataegis.f))
  n_events <- muts_n_events.dt[,.(n_events=sum(n_events)),by=cluster_type]
  
  # Expected counts: component density times the number of events of that type.
  # The densities are on "mutations - 2", hence the shift back by 2 on the x axis.
  fitted_x <- 0:10 + 2
  n_omikli <- n_events[cluster_type=='omikli',n_events]
  n_kataegis <- n_events[cluster_type=='kataegis',n_events]
  fitted_n_events <- data.table(muts_per_event=fitted_x,
                                omikli=poisson_df$`Component 1` * n_omikli,
                                kataegis=poisson_df$`Component 2` * n_kataegis)
  n_all_events <- n_events[,sum(n_events)]
  fitted_n_events_1_comp <- data.table(muts_per_event=fitted_x,
                                       n_events=pois(0:10,lambdas[[1]][1]) * n_all_events)
  
  # Bars and axis break.
  p_n_events <- ggplot(muts_n_events.dt) +
    theme_classic() +
    geom_bar(aes(x=muts_per_event,y=n_events,fill=forcats::fct_rev(cluster_type)), position='stack', stat='identity') +
    xlab('Mutations per event') +
    ylab('Number of events') +
    labs(fill='Cluster type') +
    # theme(legend.position = 'top') +
    # theme(legend.position = c(0.5,0.8)) +
    scale_y_break(c(200,1000),scale=0.5)
  # Two-component fit: each curve is drawn twice, a thick black pass first and
  # a thin colored pass on top, so layer order matters here.
  p_n_events <- p_n_events +
    geom_line(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color='black', size=1.5) +
    geom_line(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color=cols[1], size=0.5) +
    geom_point(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color='black', size=2) +
    geom_point(data=fitted_n_events, aes(x=muts_per_event,y=omikli), color=cols[1], size=1) +
    geom_line(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color='black', size=1.5) +
    geom_line(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color=cols[2], size=0.5) +
    geom_point(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color='black', size=2) +
    geom_point(data=fitted_n_events, aes(x=muts_per_event,y=kataegis), color=cols[2], size=1)
  # One-component fit and the LR-test annotation.
  # NOTE(review): the p-values in the label are typed in by hand, not taken
  # from the LR_test() calls above -- confirm they match the current data.
  p_n_events <- p_n_events +
    geom_line(data=fitted_n_events_1_comp, aes(x=muts_per_event,y=n_events), color='black', size=0.75) +
    geom_point(data=fitted_n_events_1_comp, aes(x=muts_per_event,y=n_events), color='black', size=0.75) +
    annotate("text",x=15, y=50, label="LR test\nk=1, p=0.002\nk=2, p=0.2385")
  # x_max <- muts_n_events.dt[n_events!=0,max(muts_per_event)]
  # p_n_events <- reposition_legend(p_n_events,x=0.5,y=0.8,just=0.5)#'top')
  p_n_events
  
  events_png <- file.path(fig_outdir,'number_of_events_vs_mutations_per_event.png')
  ggsave(events_png,p_n_events,scale=0.5)
  
  # Same figure as p_n_events with the legend suppressed. It is derived from
  # p_n_events instead of repeating the whole layer stack, so that the two
  # figures cannot diverge.
  p_n_events_wo_legend <- p_n_events +
    theme(legend.position = 'none')
  
  p_n_events_wo_legend
  ggsave(file.path(fig_outdir,'number_of_events_vs_mutations_per_event.wo_legend.png'),p_n_events_wo_legend,scale=0.5)
  
  
  # omikli 3, 4, vs. 5
  # Log-likelihood and degrees of freedom of the two-component fit, with the
  # values recorded for each omikli definition.
  # 3:
  flexfit[[2]]@logLik # -1246.347
  flexfit[[2]]@df # 3
  # 4: -1246.347
  # 5: -1246.348
  
  
}


################################################################################
# FIGURE: Clustered mutations :: rainfall plot (2022-08-03)
# ------------------------------------------------------------------------------
library(grid)
library(karyoploteR)
library(ggplotify)
# Output directory for the figures and input directory with the clustering results.
figure_dir <- '/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/figures/clustered_mutation'
cluster_result_dir <- '/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation/APOBEC_clustered_mutations'

# Inputs for the rainfall plots: the annotated clustered-mutation files
# (VAF-ignored run), the non-clustered mutation files, and the per-sample IMD
# cutoff table.
# APOBEC_cl_anno_vaf_considered_filelist <- list.files(file.path(cluster_result_dir,'annotated/vaf_considered'),full.names=TRUE)
vaf_ignored_dir <- file.path(cluster_result_dir,'annotated/vaf_ignored')
APOBEC_cl_anno_vaf_ignored_filelist <- list.files(vaf_ignored_dir,full.names=TRUE)
APOBEC_non_cl_filelist <- list.files(path=cluster_result_dir,
                                     pattern='APOBEC_nonClustered.txt',
                                     full.names=TRUE)
cutoff_table_path <- file.path(cluster_result_dir,'APOBEC_clustered_mutations_imd_cutoff.tsv')
APOBEC_cutoff_values <- fread(cutoff_table_path)

# Sample IDs are the file basenames with the file-type pattern stripped.
# (To plot the VAF-considered annotation instead, derive `cl` and the
#  APOBEC_cl column from APOBEC_cl_anno_vaf_considered_filelist.)
cl <- gsub('.APOBEC_clustered.annotated.tsv','',basename(APOBEC_cl_anno_vaf_ignored_filelist))
ncl <- gsub('.APOBEC_nonClustered.txt','',basename(APOBEC_non_cl_filelist))

# One row per non-clustered file; the clustered file is looked up by sample ID
# and is NA when no annotated file matches.
cl_file_for_ncl <- APOBEC_cl_anno_vaf_ignored_filelist[match(ncl,cl)]
APOBEC_cl_mut_filelist <- data.table(ID=ncl,
                                     APOBEC_cl=cl_file_for_ncl,
                                     APOBEC_ncl=APOBEC_non_cl_filelist)

# Put the cutoff rows in the same order as the file table so that both can be
# indexed with the same row number.
cutoff_row_idx <- match(APOBEC_cl_mut_filelist$ID,APOBEC_cutoff_values$Sample)
APOBEC_cutoff_values <- APOBEC_cutoff_values[cutoff_row_idx,]

create_dir(figure_dir)

# Draw one rainfall plot per sample and save it as
# <sample_id>.rainfall_plot.pdf in figure_dir. Clustered and non-clustered
# substitutions are pooled, purine-reference changes are complemented so that
# every change is written with a C or T reference, points are colored by
# substitution class, and the sample's IMD cutoff is drawn as a red line.
for (i in seq_len(nrow(APOBEC_cl_mut_filelist))) { # seq_len(): no iteration when the table is empty
  
  # f <- APOBEC_cl_anno_vaf_considered_filelist[1]
  # f <- APOBEC_cl_anno_vaf_ignored_filelist[1]
  
  # clustered mutations
  sample_id <- APOBEC_cl_mut_filelist[i,ID]
  x <- fread(APOBEC_cl_mut_filelist[i,APOBEC_cl])
  x$chr <- x$chr %>% gsub('MT','M',.) %>% paste0('chr',.)
  gr_cl <- GRanges(seqnames=x$chr,ranges=IRanges(x$start,x$end))
  mcols(gr_cl) <- data.frame(mut.type='subs',ref=x$ref,alt=x$alt)
  
  # non-clustered mutations
  y <- fread(APOBEC_cl_mut_filelist[i,APOBEC_ncl],fill=TRUE)
  y$chr <- y$chr %>% gsub('MT','M',.) %>% paste0('chr',.)
  gr_ncl <- GRanges(seqnames=y$chr,ranges=IRanges(y$start,y$end))
  mcols(gr_ncl) <- data.frame(mut.type='subs',ref=y$ref,alt=y$alt)
  
  # merge
  gr <- c(gr_cl, gr_ncl) %>% sort
  
  # Reverse complement: rows whose reference base is A or G get both ref and
  # alt complemented.
  # idx_rc <- gr$alt == 'A' | gr$alt == 'G'
  idx_rc <- gr$ref == 'A' | gr$ref == 'G'
  ref_rc <- gr[idx_rc]$ref %>% chartr('ACGT','TGCA',.)
  alt_rc <- gr[idx_rc]$alt %>% chartr('ACGT','TGCA',.)
  gr[idx_rc]$ref <- ref_rc
  gr[idx_rc]$alt <- alt_rc
  
  # plot
  pp <- getDefaultPlotParams(plot.type = 4)
  pp$data1inmargin <- 0
  pp$bottommargin <- 20
  pp$leftmargin <- 0.1
  pp$rightmargin <- 0.1
  
  # Plot color
  # ----------
  # Color preset
  # variant.colors <- getVariantsColors(gr$ref, gr$alt)
  # variant.colors["C>T"] <- "#e40611" # red
  # variant.colors["T>C"] <- "#fbe800" # yellow
  
  # Custom colors: one entry per mutation (computed after sorting `gr`),
  # grey for anything that is not one of the six listed classes.
  chgs <- paste0(gr$ref,'>',gr$alt)
  variant.colors <- rep("grey",length(chgs)) %>% setNames(chgs)
  # variant.colors[chgs == 'A>G'] <- '#A1CF64'
  variant.colors[chgs == 'C>A'] <- '#1EBFF0'
  variant.colors[chgs == 'C>T'] <- '#E62725'
  variant.colors[chgs == 'C>G'] <- '#050708'
  variant.colors[chgs == 'T>A'] <- '#CBCACB'
  variant.colors[chgs == 'T>C'] <- '#A1CF64'
  variant.colors[chgs == 'T>G'] <- '#EDC8C5'
  
  # for cutoff line
  # kpAxis_ymax <- log10(max(end(gr[-1]) - start(gr[-length(gr)])))
  kpAxis_ymax <- 8
  # The log10 cutoff is expressed as a fraction of the axis maximum;
  # kpSegments() below is called without ymin/ymax, so this presumably lands
  # on its default 0-1 scale -- TODO confirm.
  cutoff_line_y <- log10(APOBEC_cutoff_values$IMD_cutoff[i]) / kpAxis_ymax
  
  outfilename <- paste0(sample_id,'.rainfall_plot.pdf')
  pdf(file.path(figure_dir,outfilename),width=11)
  # Close the PDF device even when a plotting call fails, so that an error in
  # one sample does not leave a graphics device open.
  tryCatch({
    # x <- as.grob(expression({
    kp <- plotKaryotype(plot.type=4, ideogram.plotter = NULL,
                        labels.plotter = NULL, plot.params = pp)
    
    # Remove chrY
    kp$chromosomes <- kp$chromosomes %>% setdiff('chrY')
    kp$chromosome.lengths <- kp$chromosome.lengths[kp$chromosomes]
    kp$cytobands <- kp$cytobands[seqnames(kp$cytobands) %in% kp$chromosomes]
    kp$genome <- kp$genome[seqnames(kp$genome) %in% kp$chromosomes]
    kp$plot.region <- kp$plot.region[seqnames(kp$plot.region) %in% kp$chromosomes]
    
    kpAddCytobandsAsLine(kp, lwd=7)
    kpAddChromosomeNames(kp, srt=45)
    kpAddMainTitle(kp, main=sample_id, cex=1.2)
    # Axis and points share kpAxis_ymax so that they cannot drift apart.
    kpAxis(kp, ymax = kpAxis_ymax, tick.pos = 0:kpAxis_ymax)
    kpPlotRainfall(kp,
                   data = gr,
                   col=variant.colors,
                   ymin=0,
                   ymax=kpAxis_ymax)
    # kpPlotRainfall(kp, data = gr)
    kpAddLabels(kp, labels = c("Distance between mutations (log10)"), srt=90, pos=1, label.margin = 0.07)
    # Full element name instead of relying on `$` partial matching.
    kpSegments(kp,
               chr=kp$chromosomes,
               x0=0,
               x1=kp$chromosome.lengths+1E7,
               y0=cutoff_line_y,
               y1=cutoff_line_y,
               col='red',
               clipping=FALSE)
    par(xpd=TRUE)
    legend(x = "bottomright",
           fill = c('#1EBFF0', '#050708', '#E62725', '#CBCACB', '#A1CF64', '#EDC8C5'),
           legend = c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"),
           inset=c(-0.01,0.075))
    
    # kpText(kp, chr="chr1", x=x, y=y, labels='cutoff', col='red')
    # }))
    # grid.newpage()
    # grid.draw(x)
  }, finally = dev.off())
  
}
# Source:
# [1] https://bernatgel.github.io/karyoploter_tutorial//Examples/Rainfall/Rainfall.html










# Test bed (2022-04-28) --------------------------------------------------------
## flexmix
#
# Not used ---------------------------------------------------------------------
# mixtools
## EM output for data generated from a 2-component model.
# set.seed(100)
# beta <- matrix(c(1, .5, .7, -.8), 2, 2)
# x <- runif(50, 0, 10)
# xbeta <- cbind(1, x)%*%beta
# w <- rbinom(50, 1, .5)
# y <- w*rpois(50, exp(xbeta[, 1]))+(1-w)*rpois(50, exp(xbeta[, 2]))
# out <- poisregmixEM(y, x, verb = TRUE, epsilon = 1e-03)
# out
#
# ------------------------------------------------------------------------------
# A script for finding an IMD for each sample
# ------------------------------------------------------------------------------
# NOTE: The following has been replaced by SigProfilerCluster.
# ------------------------------------------------------------------------------
# obsFile_path <- "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/24_mutect2_strelka2_baseq/merge/clonal/new_filter/A3A_1st_C3_100ng-1.mutect2_strelka2_union.snvs.vcf.seqzcn.scF.bino1-4P.prob.clonal.vcf"
# simDir_path <- "/home/users/jolim/Projects/S04_Yohan_An/02_APOBEC/data/20220424_simulation/00_clustered_mutation/A3A_1st_C3/output/simulations/APOBEC_simulations_GRCh37_96/"
# simFile_paths <- list.files(simDir_path,full=TRUE)
# x <- fread(obsFile_path)
# y <- fread(simFile_path)
# sims <- lapply(1:length(simFile_paths), function(i){
#     y <- fread(simFile_paths[i])
#     y <- y[order(Start_position)][mixedorder(Chrom)][,.(Chrom,POS=Start_position)]
#     y[,run:=i]
# }) %>% do.call(rbind,.)
# sims

# # Sort and subset x
# x <- x[order(POS)][mixedorder(`#CHROM`)][,.(Chrom=`#CHROM`,POS)]
# x_imd <- x[,.(imd=get_imd(POS)),by=Chrom]

# IMD <- 1000
# # sum(x_imd$imd >= IMD)
# sum(x_imd$imd < IMD)
# sims[run==3,.(imd=get_imd(POS)),by=Chrom][,sum(imd<IMD)]

# # Find the IMD that makes 90% of the simulation runs produce at least one cluster.
# IMD <- 1000 # initial value
# increment <- 100
# alpha <- 0.5
# increment_direction <- 0
# fdr <- 0.1
# while (TRUE) {
#     cat('IMD:',IMD)
#     sims_imd <- sims[,.(imd=get_imd(POS)),by=.(run,Chrom)]
#     o <- sum(sims_imd[,sum(imd < IMD) >= 1,by=run]$V1)/100
#     cat(' -> FDR',1-o,'\n')
#     if (o == (1 - fdr)) {
#         break
#     }
#     if (o < (1 - fdr)) {
#         if (increment_direction %in% c(-1,0)) {
#             IMD <- IMD + increment
#         } else if (increment_direction == 1) {
#             IMD <- IMD - increment
#             increment <- increment * alpha
#             increment_direction <- 0
#         }
#     } else if (o > (1 - fdr)) {
#         if (increment_direction %in% c(1,0)) {
#             IMD <- IMD - increment
#         } else if (increment_direction == -1) {
#             IMD <- IMD + increment
#             increment <- increment * alpha
#             increment_direction <- 0
#         }
#     }
#     # Sys.sleep(0.01)
# }