# Draft code to link SVs with TEs (to be deprecated)

library(AnnotationDbi)
library(GenomicRanges)
library(ggplot2)
library(tibble)
library(dplyr)
library(tidyr)
library(readr)
library(glue)

# Switch to the project's R script directory. Every relative path used below
# ("modules/...", "./local/...") is resolved against this directory, so this
# call must run before anything else in the script.
setwd(paste0(Sys.getenv("HOME"), "/code/malawi_transposon/script/R_SCRIPT/"))

# Project helper modules, evaluated for their side effects.
# NOTE(review): `chrom_sizes_list` and `num_of_genomes` are used further down
# but are not defined in this file -- presumably they come from these modules;
# confirm.
source("modules/get_chrom_sizes.R")
source("modules/misannotation_thresholds.R")

# astCal version
# Genome/backbone configuration for this run:
#   subdir          prefix of the preprocessed SV summary CSV under ./local/
#   bb_name         genome directory name used in the annotation file paths
#   bb_name_sqlite  basename of the "<name>.gff.sqlite" annotation database
#   bb_name_abrv    key used to look up this genome in `chrom_sizes_list`
subdir <- "malawi_haplochromines"
bb_name <- "astCal1.2_ensembl"
bb_name_sqlite <- "astCal_v1.2"
bb_name_abrv <- "astCal"

################################################################################

# Load the gene description table. The file's own header row is skipped and
# replaced with fixed column names; the "[Source:...]" annotation is stripped
# from each description.
gene_descriptions <- mutate(
  readr::read_csv(
    glue(Sys.getenv("HOME"), "/code/malawi_transposon/cloud/genome/{bb_name}/gene_description.csv"),
    skip = 1,
    col_names = c("gene", "gene_description", "gene_name")
  ),
  gene_description = gsub("\\[Source:.*?\\]", "", gene_description)
)

# Load the preprocessed SV summary table. The column-type spec is a fixed
# 20-column prefix followed by one 6-column group per genome; "." is read as
# NA in addition to the usual empty string and "NA".
# NOTE(review): `num_of_genomes` is not defined in this file -- presumably it
# is set by one of the sourced modules; confirm.
df_sv <- readr::read_csv(
  glue("./local/{subdir}-summary-preprocessed.csv"),
  col_types = paste0(
    c("cciiiiicciiicilliidc", rep("ciicic", num_of_genomes)),
    collapse = ""
  ),
  na = c("", "NA", ".")
)

# Genomic ranges of the structural variants, one range per row of `df_sv`.
# The SV identifier is carried along as the `id` metadata column. Sequence
# names and lengths are taken from the first and second element of this
# genome's entry in `chrom_sizes_list`.
gr_sv <- GRanges(
  seqnames = df_sv$chr,
  ranges = IRanges(start = df_sv$start, end = df_sv$end),
  seqinfo = Seqinfo(
    seqnames = chrom_sizes_list[[bb_name_abrv]][[1]],
    seqlengths = chrom_sizes_list[[bb_name_abrv]][[2]]
  ),
  id = df_sv$id
)

################################################################################

# Load the gene annotation database and extract one range per gene, keeping
# the gene id and transcript names as metadata columns.
path_to_annotation_sqlite <- glue(
  Sys.getenv("HOME"), "/igv/genomes/{bb_name}/{bb_name_sqlite}.gff.sqlite")
txdb <- AnnotationDbi::loadDb(file = path_to_annotation_sqlite)
gr_gene <- genes(txdb, columns = c("gene_id", "tx_name"))

# Pad every gene by 2000 bp on each side: first grow the range by 2000 bp
# while holding its start fixed, then grow it by another 2000 bp while holding
# its end fixed. Finally trim() clips anything that now extends out of bounds.
gr_gene_flanked <- resize(
  gr_gene,
  width = width(gr_gene) + 2000,
  fix = "start"
)
gr_gene_flanked <- resize(
  gr_gene_flanked,
  width = width(gr_gene_flanked) + 2000,
  fix = "end"
)
gr_gene_flanked <- trim(gr_gene_flanked)

################################################################################

# Build a window around the start of each gene.
#
# For every gene the window spans 2000 bp outside the gene plus an inner flank
# reaching into the gene body. The inner flank is half the gene length
# (truncated to an integer), capped at 2000 bp, so short genes are never
# covered by more than half of their length.
#
# The window is anchored on the strand-aware gene start: `start` for "+" genes
# and `end` for every other strand value.
#
# pmin() computes the capped inner flank for all genes at once; this replaces
# the previous rowwise()/min()/ungroup() sequence and yields the same values.
df_gene <- as.data.frame(gr_gene) %>%
  as_tibble() %>%
  dplyr::select(gene_id, seqnames, start, end, strand) %>%
  mutate(
    # strand-aware gene start / end coordinates
    true_start = if_else(strand == "+", start, end),
    true_end = if_else(strand == "+", end, start),
    gene_length = abs(end - start),
    inner_flank_size = pmin(as.integer(0.5 * gene_length), 2000),
    # "+" genes: [start - 2000, start + inner]; others: [end - inner, end + 2000]
    flank_start = if_else(strand == "+", start - 2000, end - inner_flank_size),
    flank_end = if_else(strand == "+", start + inner_flank_size, end + 2000)
  )

# Ranges of the gene-start windows, keeping strand and gene id.
# No seqinfo is attached here, so windows near a sequence edge are not clipped.
gr_geneStartRegion <- GRanges(
  seqnames = df_gene$seqnames,
  ranges = IRanges(start = df_gene$flank_start, end = df_gene$flank_end),
  strand = df_gene$strand, gene_id = df_gene$gene_id
)

################################################################################

# Tabulate SV-gene overlaps.
#
#   hits        Hits object returned by findOverlaps(gr_query, gr_subject)
#   gr_query    GRanges of SVs with an `id` metadata column
#   gr_subject  GRanges of gene regions with a `gene_id` metadata column
#   sv_table    SV summary table providing `id` and the bubble properties
#   gene_info   gene description table keyed by `gene`
#
# Returns a tibble with one row per overlapping (bubble, gene) pair, annotated
# with the gene description columns and the bubble's size/path properties.
tabulate_sv_gene_hits <- function(hits, gr_query, gr_subject,
                                  sv_table, gene_info) {
  # dplyr::select is spelled out because AnnotationDbi also exports select()
  sv_properties <- dplyr::select(
    sv_table,
    bubble = id, n_segments, n_paths, n_paths_bio,
    len_min, len_max, lenbio_min, lenbio_max, ends_with("_len")
  )

  tibble(
    bubble = gr_query$id[queryHits(hits)],
    gene = gr_subject$gene_id[subjectHits(hits)]
  ) %>%
    left_join(gene_info, by = "gene") %>%
    left_join(sv_properties, by = "bubble")
}

# Keep only the bubbles that look like clear-cut transposon insertions:
# at most 3 paths (n_paths_bio), fewer than 5 segments, a shortest length
# below 100 and a longest length above 1000.
keep_likely_te_insertions <- function(df_hits) {
  dplyr::filter(
    df_hits,
    n_paths_bio <= 3, n_segments < 5, len_min < 100, len_max > 1000
  )
}

# intersect each SV with each flanked gene
hits <- findOverlaps(gr_sv, gr_gene_flanked)

# list of bubbles and the genes they overlap or lie within 2000 bp of
# (gr_gene_flanked is the gene range padded by 2000 bp on each side),
# together with the bubble properties
df_hits <- tabulate_sv_gene_hits(
  hits, gr_sv, gr_gene_flanked,
  sv_table = df_sv, gene_info = gene_descriptions
)

length(unique(df_hits$gene))

# keep bubbles that are likely to be clear cut transposon insertions
df_hits_filter <- keep_likely_te_insertions(df_hits)

length(unique(df_hits_filter$gene))

writeLines(unique(df_hits_filter$gene), "./local/gene_ontology_TE/likely_hits.txt")

################################################################################

# intersect each SV with each gene's start region
hits <- findOverlaps(gr_sv, gr_geneStartRegion)

df_hits <- tabulate_sv_gene_hits(
  hits, gr_sv, gr_geneStartRegion,
  sv_table = df_sv, gene_info = gene_descriptions
)

length(unique(df_hits$gene))

# keep bubbles that are likely to be clear cut transposon insertions
df_hits_filter <- keep_likely_te_insertions(df_hits)

writeLines(unique(df_hits_filter$gene), "./local/gene_ontology_TE/likely_hits-nearStart.txt")


