# Joonoh Lim (joonoh.lim@kaist.ac.kr)
# -----------------------------------
# First created on 2022-04-24
# Last modified on 2022-04-24,25,28
# ------------------------------------------------------------------------------
library(data.table)
library(magrittr)
library(gtools)
library(pbmcapply)
library(BSgenome.Hsapiens.Ensembl.GRCh37.jolim)
library(ggplot2)
library(ggbreak)

library(lemon)
library(scales)
library(flexmix) # flexmix

# ------------------------------------------------------------------------------

# Helper functions -------------------------------------------------------------
get_imd <- function(a) {
  # Nearest-neighbour inter-mutation distance (IMD).
  #
  # For every element of `a` this returns the smaller of the two gaps to its
  # immediate neighbours (the following and the preceding element). The first
  # element has no predecessor and the last has no successor, so each of them
  # uses its single available gap. A length-one input yields NA because there
  # is no neighbour at all.
  #
  # a: numeric/integer vector of positions, in the order in which neighbours
  #    should be compared (the caller passes positions of one chromosome).
  # Returns a vector of the same length and type as the gaps of `a`.
  n <- length(a)
  gap_to_next <- c(tail(a, -1), NA) - a
  gap_to_prev <- a - c(NA, head(a, -1))
  # Fill the two boundary gaps that have no neighbour on one side.
  gap_to_next[n] <- gap_to_prev[n]
  gap_to_prev[1] <- gap_to_next[1]
  # Start from the forward gap and overwrite it wherever the backward gap is
  # strictly smaller.
  prev_is_closer <- gap_to_prev < gap_to_next
  replace(gap_to_next, prev_is_closer, gap_to_prev[prev_is_closer])
}
# For test,
# a <- x[Chrom=='1',POS]


assign_cluster <- function(pos,imd_cutoff) {
  # Assign a running cluster label to every position.
  #
  # Walking through `pos` in the given order, a new cluster is opened whenever
  # the gap to the previous position is strictly greater than the cutoff.
  # Labels are the cluster numbers ("1", "2", ...) as character strings and
  # restart at "1" on every call, so they are only unique within one call
  # (the caller invokes this once per chromosome).
  #
  # pos:        numeric/integer vector of positions (expected to be sorted).
  # imd_cutoff: distance cutoff; only the first element is used, so a
  #             per-row column holding one repeated value may be passed.
  # Returns a character vector of the same length as `pos`.
  #
  # Implementation note: the labels are derived with a cumulative sum over
  # the "gap exceeds cutoff" flags instead of an R-level loop over clusters,
  # which gives the same labels without the per-cluster iteration.
  if (length(pos) == 0) {
    # Nothing to label (previously a spurious single "1" was returned).
    return(character(0))
  }
  imd_cutoff <- imd_cutoff[1]
  gap <- c(0, diff(pos)) # consecutive imd; the first position has no gap
  # which() drops NA comparisons (e.g. an NA cutoff), so such gaps never open
  # a new cluster.
  opens_cluster <- logical(length(pos))
  opens_cluster[which(gap - imd_cutoff > 0)] <- TRUE
  as.character(1 + cumsum(opens_cluster))
}


classify_substitution_type <- function(imd,vaf,vaf_diff_cutoff) {
  # Classify each mutation of one group as 'sbs', 'dbs' or 'mbs'.
  #
  # A mutation counts as "adjacent" when its imd equals 1. Consecutive
  # adjacent mutations form a run: a run of exactly two is a doublet ('dbs'),
  # a run of three or more is a multi-base substitution ('mbs'), everything
  # else is a single-base substitution ('sbs').
  #
  # imd:             inter-mutation distance per mutation.
  # vaf:             variant allele fraction per mutation (same length).
  # vaf_diff_cutoff: mutations whose VAF differs from the group mean by more
  #                  than this are never treated as adjacent; only the first
  #                  element is used.
  # Returns a character vector of the same length as `imd`.
  #
  # An NA imd (no neighbour available, e.g. the only mutation on a
  # chromosome) is treated as "not adjacent" and therefore yields 'sbs';
  # previously such entries were left as an empty string.
  #
  # NOTE(review): runs are detected purely from consecutive imd == 1 entries.
  # If `imd` is a nearest-neighbour distance, two separate adjacent pairs that
  # directly follow each other in the group (e.g. positions 100,101,150,151)
  # would form one run of four and be labelled 'mbs' -- confirm whether this
  # can occur in the data.
  vaf_diff_cutoff <- vaf_diff_cutoff[1]
  subs_type <- vector('character',length(imd))
  # which() ignores NA comparisons (NA VAF or NA cutoff), i.e. no filtering.
  vaf_outlier <- which(abs(mean(vaf) - vaf) > vaf_diff_cutoff)
  imd[vaf_outlier] <- Inf
  is_adjacent <- as.integer(!is.na(imd) & imd == 1)
  # Length of the run each element belongs to, zeroed for non-adjacent ones.
  run_len <- rle(is_adjacent)$lengths
  run_len <- rep(run_len, times = run_len) * is_adjacent
  subs_type[run_len <= 1] <- 'sbs'
  subs_type[run_len == 2] <- 'dbs'
  subs_type[run_len > 2] <- 'mbs'
  subs_type
}


classify_cluster_type <- function(imd,subs_type,vaf,vaf_diff_cutoff,omikli_upto_n_muts=3) {
  # Classify every mutation of one cluster as 'other', 'dbs', 'mbs',
  # 'omikli' or 'kataegis'.
  #
  # imd:                only its length is used (one entry per mutation).
  # subs_type:          per-mutation substitution type ('sbs'/'dbs'/'mbs').
  # vaf:                per-mutation variant allele fraction.
  # vaf_diff_cutoff:    mutations whose VAF differs from the cluster mean by
  #                     more than this are labelled 'other' and excluded from
  #                     the counts; only the first element is used.
  # omikli_upto_n_muts: clusters with at most this many events are 'omikli',
  #                     larger ones 'kataegis'. Must be >= 2, otherwise an
  #                     error is raised.
  # Returns a character vector of the same length as `imd`.
  #
  # Events are counted as: every 'sbs' mutation individually, and every run
  # of consecutive 'dbs' (or 'mbs') entries as one event.
  # NOTE(review): two distinct dbs/mbs events that sit directly next to each
  # other in `subs_type` form a single run and are counted once -- confirm
  # this is acceptable.
  vaf_diff_cutoff <- vaf_diff_cutoff[1]

  # Sanity check
  if (omikli_upto_n_muts < 2) {
    stop("'omikli_upto_n_muts' should be greater than 1.")
  }

  # Other: VAF too far from the cluster mean. which() skips NA comparisons.
  cluster_type <- vector('character',length(imd))
  idx_for_other <- which(abs(mean(vaf) - vaf) > vaf_diff_cutoff)
  cluster_type[idx_for_other] <- "other"
  subs_type[idx_for_other] <- "other"

  run_ends <- cumsum(rle(subs_type)$lengths) # last index of every run
  n_sbs <- sum(subs_type == 'sbs')
  n_dbs <- sum(subs_type[run_ends] == 'dbs')
  n_mbs <- sum(subs_type[run_ends] == 'mbs')
  n_clustered_muts <- n_sbs + n_dbs + n_mbs
  unlabelled <- cluster_type == ''

  if (n_clustered_muts == 0) {
    # No countable event: all mutations are 'other'. Anything still
    # unlabelled (an unrecognised subs_type) is labelled 'other' as well.
    # This branch used to be a detached `if`, so control fell through and
    # such entries were labelled 'omikli'.
    cluster_type[unlabelled] <- 'other'
  } else if (n_clustered_muts == 1) {
    if (n_dbs != 0) {
      # Case 1a: the single event is a doublet.
      cluster_type[unlabelled] <- 'dbs'
    } else if (n_mbs != 0) {
      # Case 1b: the single event is a multi-base substitution.
      if (!any(unlabelled)) {
        stop("Something went wrong with 'cluster_type'")
      }
      cluster_type[unlabelled] <- 'mbs'
    } else {
      # Case 2: a lone sbs (possibly next to VAF outliers) is not a cluster.
      cluster_type[unlabelled] <- 'other'
    }
  } else if (n_clustered_muts <= omikli_upto_n_muts) {
    cluster_type[unlabelled] <- 'omikli'
  } else {
    cluster_type[unlabelled] <- 'kataegis'
  }
  cluster_type
}


create_dir <- function(dir) {
  # Ensure that `dir` exists and report on standard output what happened.
  # A missing directory is created together with any missing parents; an
  # existing one is left untouched.
  #
  # dir: path of the directory.
  needs_creation <- !dir.exists(dir)
  if (needs_creation) {
    dir.create(dir,recursive=TRUE)
  }
  # Message parts placed before and after the path.
  msg <- if (needs_creation) {
    c('A new directory','has been created.\n')
  } else {
    c('A directory','already exists.\n')
  }
  cat(msg[1],dir,msg[2])
}


annotate_sequence_context <- function(dt,
                                      n_flanking_bases_5p=1,
                                      n_flanking_bases_3p=1,
                                      chr_name='chr',
                                      pos_name='start',
                                      genome=BSgenome.Hsapiens.Ensembl.GRCh37.jolim){
  # Add the reference bases flanking each position to `dt`.
  #
  # Two columns are added with `:=`, i.e. `dt` is modified by reference:
  #   seqCtx_5p - the `n_flanking_bases_5p` bases immediately before the position
  #   seqCtx_3p - the `n_flanking_bases_3p` bases immediately after the position
  #
  # dt:       table holding the positions (assumed to be a data.table, since
  #           `:=` is used on it).
  # chr_name: name of the column with the sequence (chromosome) names.
  # pos_name: name of the column with the positions.
  # genome:   object the sequences are fetched from via getSeq().
  # Returns the result of the `:=` call on `dt`.
  chrom <- dt[[chr_name]]
  pos <- dt[[pos_name]]
  # Build the ranges covering positions from..to on the row's chromosome.
  flank_ranges <- function(from,to) {
    GRanges(seqnames=chrom,ranges=IRanges(start=from,end=to))
  }
  seqCtx_5p <- getSeq(genome,flank_ranges(pos - n_flanking_bases_5p,pos - 1))
  seqCtx_3p <- getSeq(genome,flank_ranges(pos + 1,pos + n_flanking_bases_3p))
  dt[,`:=`(seqCtx_5p=as.vector(seqCtx_5p),
           seqCtx_3p=as.vector(seqCtx_3p))]
}
# For test,
# dt = out[[1]]

# ------------------------------------------------------------------------------
# NOTE: 2022-04-28,
# Setup
# All inputs and outputs live below this project directory.
main_dir<-'/home/users/ayh/Projects/27_A3B/01_public_data/pcawg/discovery_simulation/add_simulation/unique_vaf/fin/02_results/v2_fin'
outdir <- file.path(main_dir,'APOBEC_clustered_mutations/annotated')
cluster_result_dir <- file.path(main_dir,'pcawg_simulation_clustered_mutations')

# Load data
input_vcf_filepaths <- list.files(file.path(main_dir,'00_clustered_mutation/input_vcfs_w_ccf_vaf'),full.names=TRUE)
# Dots are escaped and the pattern is anchored so that only files really
# ending in "_clustered.txt" are listed.
cluster_result_filepaths <- list.files(cluster_result_dir, pattern='_clustered\\.txt$', full.names=TRUE)
# Per-sample IMD cutoffs (columns `Sample` and `IMD_cutoff` are used below).
# The path is derived from main_dir; it resolves to the same file as before.
imd_cutoffs <- fread(file.path(main_dir,'imds/APOBEC_clonal_clustered_mutations_imd_cutoff.tsv'))
# Echo the sample IDs (only visible when top-level values are auto-printed).
imd_cutoffs$Sample
# Setup: one row per sample that has an IMD cutoff, an input VCF and a
# cluster result file. Samples missing any of the three are dropped by the
# inner joins on `Sample`.
metadata <- imd_cutoffs
metadata <- data.table(Sample=input_vcf_filepaths %>% basename %>% gsub("\\.vcf$","",.),
                       input_vcf_filepaths=input_vcf_filepaths) %>%
  merge(metadata,.,by='Sample')
metadata <- data.table(Sample=cluster_result_filepaths %>% basename %>% gsub("\\.pcawg_simulation_clustered\\.txt$","",.),
                       cluster_result_filepaths=cluster_result_filepaths) %>%
  merge(metadata,.,by='Sample')

# VAF distribution
# lapply(metadata$input_vcf_filepaths, function(f){
#     # f <- metadata$input_vcf_filepaths[1]
#     fread(f)$CCF %>% hist(breaks=1000)
# })

# Parameters
# Each value is used once as the upper event count for calling a cluster
# 'omikli'; one output column is produced per value.
omikli_upto_n_muts <- c(3,4,5)

# vaf considered
# --------------
# For every row of `metadata` (one sample): load the input VCF and the cluster
# result, attach the VAF to the clustered mutations, annotate flanking
# sequence, compute IMDs and cluster ids per chromosome, classify substitution
# types and, for each omikli cutoff, the cluster type. The result is a named
# list with one annotated table per sample.
# NOTE(review): `1:nrow(metadata)` iterates over c(1, 0) if `metadata` has no
# rows; seq_len(nrow(metadata)) would be the safe form.
out <- pbmclapply(1:nrow(metadata), function(i){
  # cat('Processing:',i,'/',nrow(metadata),'\n')#,'\r')
  # NOTE: A3B_1st_C5_100ng_48h_SC-1 doesn't have CCF.
  # i <- 1
  # i <- 6 # which(metadata$Sample == 'A3A_1st_C3_3ug-2')
  # i <- which(metadata$Sample=='A3B_1st_C5_100ng_48h_SC-1') # an exception case
  #i=1
  sample_id <- metadata[i,Sample]
  imd_cutoff <- metadata[i,IMD_cutoff]
  input_vcf <- fread(metadata[i,input_vcf_filepaths])
  # Chromosome names are forced to character in both tables so the merge keys
  # have the same type.
  input_vcf[,`#CHROM`:=as.character(`#CHROM`)]
  cluster_result <- fread(metadata[i,cluster_result_filepaths],fill=TRUE)
  cluster_result[,chr:=as.character(chr)]
  if (sample_id == 'A3B_1st_C5_100ng_48h_SC-1') {
    # input_vcf <- fread(metadata[Sample=='A3B_1st_C5_100ng_48h_SC-1',input_vcf_filepaths])
    # NOTE(review): this fills a `CCF` column, but the merge below reads
    # `VAF`, so the value computed here is not used afterwards. The capture
    # group also matches a single character only, so the text following it in
    # INFO is kept -- confirm whether this branch is still needed.
    input_vcf[,CCF:=input_vcf[,as.numeric(gsub('.*VAF=([^;|^\t])','\\1',INFO))]]
  }
  # Inner join on (chromosome, position): keeps clustered mutations that are
  # present in the input VCF and adds their VAF.
  cluster_result <- merge(cluster_result,input_vcf[,.(`#CHROM`,POS,VAF)],by.x=c('chr','start'),by.y=c('#CHROM','POS'))
  #setnames(cluster_result,'CCF','VAF')
  dat <- cluster_result
  # Order by position first, then reorder by chromosome with mixedorder().
  # NOTE(review): this relies on the second ordering keeping the position
  # order within a chromosome -- confirm.
  dat <- dat[order(start)][mixedorder(chr)]
  # Adds seqCtx_5p / seqCtx_3p to `dat` by reference.
  annotate_sequence_context(dat)
  dat[,imd:=get_imd(start),by=chr]
  # From here on `imd_cutoff` inside dat[...] refers to this new column.
  dat[,imd_cutoff:=imd_cutoffs[Sample==sample_id,IMD_cutoff]]
  dat[,cluster_id:=assign_cluster(start,imd_cutoff),by=chr] # Time consuming
  dat[,VAF:=as.numeric(VAF)]
  # Absolute VAF differences between consecutive mutations within each
  # chromosome, sorted ascending.
  # NOTE (Yohan): in an APOBEC context, mutations probably belong to the same
  # event even when their VAFs differ somewhat numerically; the difference is
  # thought to be caused by fluctuation in sequencing depth.
  vaf_diffs <- dat[,abs(VAF - data.table::shift(VAF,1))[-1],by=chr]$V1 %>% sort
  # vaf_diff_cutoff <- mean(vaf_diffs)
  # Cutoff = the sorted difference at the 90 % position.
  vaf_diff_cutoff <- vaf_diffs[floor(length(vaf_diffs)*0.90)]
  # vaf_diff_cutoff <- Inf # To ignore vaf
  # dat[,vaf_diff_cutoff:=vaf_diff_cutoff]
  # Ex.)
  # 9       128508903       APOBEC  A3A_1st_C3_3ug-3        .       GRCh37  SNP     128508903       G       C       SOMATIC 2466126 163             0.833333333333  163     7218    7       other   other   other
  # 9       128509066       APOBEC  A3A_1st_C3_3ug-3        .       GRCh37  SNP     128509066       G       C       SOMATIC 163     163             1.11627906977   163     7218    7       other   other   other
  #
  # 0.833333333333 vs. 1.11627906977 -> DP is 49 and 61, respectively
  #
  dat[,subs_type:=classify_substitution_type(imd,VAF,vaf_diff_cutoff),by=c('chr','cluster_id')]
  # One cluster-type column per omikli cutoff; the temporary `cluster_type`
  # column is renamed after each pass so the next pass can reuse the name.
  for (omikli_cutoff in omikli_upto_n_muts) {
    dat[,cluster_type:=classify_cluster_type(imd,subs_type,VAF,vaf_diff_cutoff,omikli_upto_n_muts=omikli_cutoff),by=c('chr','cluster_id')]
    setnames(dat,'cluster_type',paste0('cluster_type_omikli_upto_',omikli_cutoff))
  }
  setnames(dat,'VAF','CCF_or_VAF')
  dat
},mc.cores=15) %>% setNames(metadata$Sample)
# Row-bind the per-sample tables into one table.
out_all_in_one <- out %>% do.call(rbind,.)

# Save to files
# One TSV per sample (named after the list element) plus one TSV with all
# samples combined, all written to <outdir>/vaf_considered.
outdir_vaf_considered <- file.path(outdir,'vaf_considered')
create_dir(outdir_vaf_considered)
# seq_along() keeps the loop from running when `out` is empty.
for (i in seq_along(out)) {
  outfile <- file.path(outdir_vaf_considered,paste0(names(out)[i],'.APOBEC_clustered.annotated.tsv'))
  fwrite(out[[i]],outfile,sep='\t')
}
all_outfile <- file.path(outdir_vaf_considered,'APOBEC_clustered_mutations.all_samples.annotated.tsv')
fwrite(out_all_in_one,all_outfile,sep='\t')
