# script to identify gene annotations that overlap significantly with TEs,
# making them most likely false annotations

library(GenomicFeatures)
library(AnnotationDbi)
library(dplyr)
library(readr)
library(glue)
library(tidyr)

# import modules
source(paste0(Sys.getenv("HOME"), "/code/malawi_transposon/script/R_SCRIPT/modules/get_chrom_sizes.R"))
source(paste0(Sys.getenv("HOME"), "/code/malawi_transposon/script/R_SCRIPT/modules/import_transposons_into_granges.R"))

# Species configuration, in the order:
#   sub-directory, genome build name, sqlite file prefix, abbreviation.
# Exactly one of the lines inside c() should be active; comment out the other.
species_variables <- c(
  "malawi_haplochromines", "astCal1.2_ensembl", "astCal_v1.2", "astCal"
  # "malawi_haplochromines_zebra", "mayZeb2.0_ensembl", "mayZeb_v2.0", "mayZeb"
)

# Unpack the configuration into individually named settings.
subdir         <- species_variables[[1]]
bb_name        <- species_variables[[2]]
bb_name_sqlite <- species_variables[[3]]
bb_name_abrv   <- species_variables[[4]]

################################################################################

# Load the gene models for the selected genome build.
# The working directory is switched to the genome folder, so every relative
# path used from here on resolves against it.
genome_dir <- glue(Sys.getenv("HOME"), "/code/malawi_transposon/cloud/genome/{bb_name}/")
setwd(genome_dir)
txdb <- AnnotationDbi::loadDb(file = glue("{bb_name_sqlite}.gff.sqlite"))
gr_gene <- genes(txdb, columns = c("gene_id", "tx_name"))

# Read the gene description table; its three columns are renamed by position.
gene_descriptions <- readr::read_csv("gene_description.csv")
names(gene_descriptions) <- c("gene_id", "gene_description", "gene_name")

# Report how many genes carry a description (count and percentage).
glue(
  "{x} ({y}%) out of {nrow(gene_descriptions)} genes in {bb_name_abrv} have descriptions on them",
  x = sum(!is.na(gene_descriptions[["gene_description"]])),
  y = round(x / nrow(gene_descriptions) * 100, 2)
)

# Fetch the TE coordinates for this species via the project import helper.
gr_te_raw <- import_transposons_into_granges(bb_name_abrv)

# Keep only classified TEs (repeat_class other than "Unknown") with a score
# below 20. The cut-off is intentionally lenient so that genes are not
# thrown out too eagerly further down.
# NOTE(review): `score` is taken to be percent divergence here; confirm
# against import_transposons_into_granges().
is_classified <- gr_te_raw$repeat_class != "Unknown"
is_below_cutoff <- gr_te_raw$score < 20
gr_te <- gr_te_raw[is_classified & is_below_cutoff]

################################################################################

# Overlap of each gene with individual TEs (strand is ignored).
# For every overlapping TE/gene pair, the shared segment is expressed as a
# fraction of the gene's width (0-1, despite the "percent" name); each gene
# then keeps the largest fraction contributed by any single TE.
hits <- findOverlaps(gr_te, gr_gene, ignore.strand = TRUE)
single_te_in_hit <- gr_te[queryHits(hits)]
single_gene_in_hit <- gr_gene[subjectHits(hits)]
overlaps <- pintersect(single_te_in_hit, single_gene_in_hit, ignore.strand = TRUE)
percentOverlap <- width(overlaps) / width(single_gene_in_hit)

df_maxOverlapSingleTE <- tibble(
  gene_id = single_gene_in_hit$gene_id,
  overlapSingleTE = percentOverlap
) %>%
  group_by(gene_id) %>%
  summarise(maxOverlapSingleTE = max(overlapSingleTE))

################################################################################

# Cumulative overlap of each gene with all TEs (strand is ignored).
# Overlapping/adjacent TEs are first merged with reduce() so that no base is
# counted twice; the per-segment fractions of gene width are then summed
# per gene.
gr_te_disjoint <- GenomicRanges::reduce(gr_te, ignore.strand = TRUE)
hits <- findOverlaps(gr_te_disjoint, gr_gene, ignore.strand = TRUE)
merged_te_in_hit <- gr_te_disjoint[queryHits(hits)]
merged_gene_in_hit <- gr_gene[subjectHits(hits)]
overlaps <- pintersect(merged_te_in_hit, merged_gene_in_hit, ignore.strand = TRUE)
percentOverlap <- width(overlaps) / width(merged_gene_in_hit)

df_totalOverlapMultipleTE <- tibble(
  gene_id = merged_gene_in_hit$gene_id,
  overlapSingleTEpart = percentOverlap
) %>%
  group_by(gene_id) %>%
  summarise(totalOverlapMultipleTE = sum(overlapSingleTEpart))

################################################################################

# Combine all the info into a master dataframe for saving: one row per entry
# of the gene description table, with a flag for whether a description exists
# and both TE-overlap fractions.
# The join key is spelled out so the joins do not depend on whichever columns
# happen to be shared between the tables (and do not emit a "Joining with"
# message). Genes without any TE overlap are absent from the overlap tables,
# so their NA values are replaced by 0.
df_master <- gene_descriptions %>%
  mutate(hasGeneDescription = !is.na(gene_description)) %>%
  select(gene_id, hasGeneDescription) %>%
  left_join(df_maxOverlapSingleTE, by = "gene_id") %>%
  left_join(df_totalOverlapMultipleTE, by = "gene_id") %>%
  replace_na(list(maxOverlapSingleTE = 0, totalOverlapMultipleTE = 0))

################################################################################

# Mark genes that are annotated with the "transposition" GO term.
# The gene list is species-specific, so the file name is derived from the
# species abbreviation selected at the top of the script rather than being
# fixed to astCal (which would silently flag nothing for another species).
# NOTE(review): assumes each species has a file named
# biomart_transposition-<abbreviation>.txt, one gene ID per line; confirm
# for mayZeb.
geneListTransposition <- readLines(
  paste0(
    Sys.getenv("HOME"),
    "/code/malawi_transposon/metadata/biomart_transposition-",
    bb_name_abrv,
    ".txt"
  )
)

df_master <- df_master %>%
  mutate(hasTranspositionTerm = gene_id %in% geneListTransposition)

################################################################################

# Write the master table (relative to the current working directory).
df_master %>% write_csv("gene_trueness.csv")

# Thresholds at or above which a gene counts as strongly overlapping TEs:
# fraction of the gene covered by one TE, and by all TEs combined.
single_te_overlap_cutoff <- 0.5
total_te_overlap_cutoff <- 0.8

# Set that we will include: genes with a description, or genes strictly below
# both thresholds.
df_master %>%
  filter(
    hasGeneDescription |
      (maxOverlapSingleTE < single_te_overlap_cutoff &
        totalOverlapMultipleTE < total_te_overlap_cutoff)
  ) %>%
  View()

# Genes with no description that strongly overlap with TEs.
# `>=` makes this the exact complement of the included set, so a gene sitting
# exactly on a threshold is no longer missing from both views.
df_master %>%
  filter(
    !hasGeneDescription &
      (maxOverlapSingleTE >= single_te_overlap_cutoff |
        totalOverlapMultipleTE >= total_te_overlap_cutoff)
  ) %>%
  View()

# Genes with description but strongly overlapping with TEs (curious case);
# same overlap criterion as the excluded set above.
df_master %>%
  filter(
    hasGeneDescription &
      (maxOverlapSingleTE >= single_te_overlap_cutoff |
        totalOverlapMultipleTE >= total_te_overlap_cutoff)
  ) %>%
  View()


