source(paste0(Sys.getenv("HOME"), "/code/malawi_transposon/script/R_SCRIPT/modules/generic_granges_functions.R"))

# Root directory of the pseudoreference data, located under the user's HOME.
# The import functions in this file build their input file paths from it.
dir_pseudogenome <- paste0(
  Sys.getenv("HOME"),
  "/code/malawi_transposon/cloud/graph_genome/malawi_haplochromines/pseudoreference"
)

################################################################################

# Build a Seqinfo object (sequence names and lengths) for one species from the
# .fai index of its pseudoreference path FASTA.
#
# species: species name, interpolated into "fasta/path_{species}.fa.fai"
#          under `dir_pseudogenome`.
# Returns: a Seqinfo built from the first two columns of the index file.
import_chrom_sizes <- function(species) {
  fai_path <- glue("{dir_pseudogenome}/fasta/path_{species}.fa.fai")

  # Only the first two columns are kept (name as character, length as integer);
  # the three '_' entries in col_types skip the remaining columns.
  fai_tbl <- readr::read_delim(
    fai_path,
    delim = '\t',
    col_names = c('chr', 'len'),
    col_types = 'ci___'
  )

  Seqinfo(fai_tbl$chr, fai_tbl$len)
}

# Import the flexible-node coordinates of one species as a GRanges object.
#
# species: species name, interpolated into
#          "nodes_and_paths/{species}.nodes.flex.bed" under `dir_pseudogenome`.
# Returns: a GRanges with one range per node whose `len` is > 0, carrying the
#          metadata columns `node_len` and `node_name`, and the seqinfo
#          produced by import_chrom_sizes() for the same species.
import_flex_coord_into_granges <- function(species) {
  # flexible nodes
  # (no `delim` is given, so readr guesses the delimiter from the file)
  filepath_flexnode <- glue("{dir_pseudogenome}/nodes_and_paths/{species}.nodes.flex.bed")
  tmp_df <- readr::read_delim(
    filepath_flexnode,
    col_types = 'ciici',  # spelled out in full; no partial argument matching
    col_names = c('chr', 'start', 'end', 'node', 'len')
  ) %>%
    filter(len > 0)

  # chromosome sizes: reuse the shared helper instead of re-reading the .fai here
  chrom_info <- import_chrom_sizes(species)

  # convert into GRanges object
  # NOTE(review): start/end are passed to IRanges unchanged. If the .bed file
  # uses the 0-based, half-open BED convention, `start` would need +1 to match
  # the 1-based closed ranges of IRanges -- confirm against the file producer.
  GRanges(
    seqnames = tmp_df$chr,
    ranges = IRanges(start = tmp_df$start, end = tmp_df$end),
    node_len = tmp_df$len,
    node_name = tmp_df$node,
    seqinfo = chrom_info
  )
}

################################################################################

# Import the RepeatMasker-derived transposon annotation of one species as a
# GRanges object.
#
# species: species name, interpolated into
#          "repeatmasker_astCalLib/path_{species}.simple" under
#          `dir_pseudogenome`; also stored in the `species` metadata column.
# Returns: a GRanges sorted by chr/start/end with metadata columns name, score,
#          repeat_color, repeat_class, repeat_subclass, repmask_id, te_id,
#          split and species, and the seqinfo produced by import_chrom_sizes().
#
# Relies on `color_schema_te` (defined outside this function) for the order of
# the repeat-class factor levels.
import_transposons_into_granges_pseudo <- function(species) {
  # repeat classes that are dropped from the annotation
  excluded_classes <- c(
    'Simple_repeat', 'Low_complexity', 'tRNA', 'snRNA', 'rRNA', 'Satellite'
  )

  # transposons (the file's header row provides the column names)
  filepath_te_species <- glue("{dir_pseudogenome}/repeatmasker_astCalLib/path_{species}.simple")
  tmp_df <- readr::read_delim(filepath_te_species, delim = '\t', col_types = 'idciiccccccc') %>%
    # strip spaces so the class labels compare cleanly below
    mutate(repeat_class_broad = gsub(" ", "", repeat_class_broad)) %>%
    filter(!repeat_class_broad %in% excluded_classes) %>%
    # harmonise labels: 'RC' is reported as 'Helitron', 'SINE?' as 'SINE'
    mutate(repeat_class_broad = if_else(repeat_class_broad == 'RC', 'Helitron', repeat_class_broad)) %>%
    mutate(repeat_class_broad = if_else(repeat_class_broad == 'SINE?', 'SINE', repeat_class_broad)) %>%
    arrange(chr, start, end)

  # refactor the TE column to have desired order
  # (any class not listed in color_schema_te$te_family becomes NA here)
  tmp_df$repeat_class_broad <- factor(tmp_df$repeat_class_broad, levels = color_schema_te$te_family)

  # chromosome sizes: reuse the shared helper instead of re-reading the .fai here
  chrom_info <- import_chrom_sizes(species)

  # convert into GRanges object, keeping relevant columns
  GenomicRanges::GRanges(
    seqnames = tmp_df$chr,
    ranges = IRanges(start = tmp_df$start, end = tmp_df$end),
    strand = tmp_df$complement,
    name = tmp_df$repeat_family,
    score = tmp_df$perc_div,
    repeat_color = tmp_df$color,
    repeat_class = tmp_df$repeat_class_broad,
    repeat_subclass = tmp_df$repeat_class,
    repmask_id = tmp_df$repmask_id,
    te_id = tmp_df$id,
    split = tmp_df$split,
    species = species,
    seqinfo = chrom_info
  )
}

################################################################################

# Intersect the flexible nodes of one species with its transposon annotation.
#
# species: name used to look up the per-species GRanges in the global lists
#          `gr_flexnode_list` and `gr_te_list`; also stored in the `species`
#          metadata column of the result.
# Returns: a GRanges built from the table returned by
#          do_reciprocal_intersection(), carrying node and TE metadata and the
#          seqinfo of the flexible-node GRanges.
intersect_flexnode_with_te <- function(species) {
  flex_nodes <- gr_flexnode_list[[species]]
  te_ranges <- gr_te_list[[species]]

  # record each TE's full width before intersecting, so it stays available
  # as a column of the intersection table
  te_ranges$te_len <- width(te_ranges)

  # reciprocal intersect keeps the information of both inputs
  overlap_df <- do_reciprocal_intersection(flex_nodes, te_ranges)

  # rebuild a GRanges from the intersection table
  GRanges(
    seqnames = overlap_df$seqnames,
    ranges = IRanges(start = overlap_df$start, end = overlap_df$end),
    strand = overlap_df$strand,
    node_len = overlap_df$node_len,
    node_name = overlap_df$node_name,
    te_name = overlap_df$name,
    te_score = overlap_df$score,
    te_class = overlap_df$repeat_class,
    te_subclass = overlap_df$repeat_subclass,
    # te_id / te_len: useful to filter TEs based on age and length later on
    te_id = overlap_df$te_id,
    te_len = overlap_df$te_len,
    repmask_id = overlap_df$repmask_id,
    split = overlap_df$split,
    species = species,
    seqinfo = seqinfo(flex_nodes)
  )
}
