# Gene Ontology (GO) enrichment analysis for genes at sites with TE insertions,
# after removing the false gene annotations identified throughout the analysis.

library(AnnotationDbi)
library(GenomicRanges)
library(ggplot2)
library(tibble)
library(dplyr)
library(tidyr)
library(readr)
library(glue)
library(gprofiler2)

# Work from the project's R script directory so that the relative "modules/"
# and "./local/" paths used below resolve.
setwd(
  paste0(Sys.getenv("HOME"), "/code/malawi_transposon/script/R_SCRIPT/")
)

# Project helper modules; source() evaluates each file in the global
# environment, in the order listed.
invisible(lapply(
  c(
    "modules/get_chrom_sizes.R",
    "modules/misannotation_thresholds.R",
    "modules/custom_gene_ontology.R"
  ),
  source
))

# Assembly configuration (astCal version)
subdir         <- "malawi_haplochromines"
bb_name        <- "astCal1.2_ensembl"
bb_name_sqlite <- "astCal_v1.2"
bb_name_abrv   <- "astCal"

################################################################################

# Load the gene annotation (TxDb stored as SQLite) for the configured assembly.
# The path is built from `bb_name` / `bb_name_sqlite` so that changing the
# assembly configuration also changes the annotation that is loaded, in line
# with how the chrom.sizes path is built later in this script.
path_to_annotation_sqlite <- glue(
  Sys.getenv("HOME"), "/igv/genomes/{bb_name}/{bb_name_sqlite}.gff.sqlite")
if (!file.exists(path_to_annotation_sqlite)) {
  stop("Annotation database not found: ", path_to_annotation_sqlite,
       call. = FALSE)
}
txdb <- AnnotationDbi::loadDb(file = path_to_annotation_sqlite)
# NOTE(review): genes() is not provided by the packages attached at the top of
# this file; presumably it becomes available through loadDb() or one of the
# sourced modules -- confirm.
gr_gene <- genes(txdb, columns = c("gene_id", "tx_name"))

# Gene body plus 2000 bp upstream: every range is widened by 2000 bp while its
# (strand-aware) end stays fixed, then trim() clips ranges that fall outside
# known sequence bounds.
gr_geneFlanked <- gr_gene %>%
  resize(width = width(gr_gene) + 2000, fix = "end") %>%
  trim()

# Build the "start region" of every gene: 2000 bp upstream of the gene start
# plus an inner flank reaching into the gene body. The inner flank is 2000 bp,
# or half the gene length when the gene is shorter than 4000 bp.
df_gene <- as.data.frame(gr_gene) %>%
  as_tibble() %>%
  dplyr::select(gene_id, seqnames, start, end, strand) %>%
  mutate(
    # strand-aware start / end of the gene
    true_start = if_else(strand == "+", start, end),
    true_end = if_else(strand == "+", end, start),
    gene_length = abs(end - start),
    # vectorised cap; no row-wise evaluation needed
    inner_flank_size = pmin(as.integer(0.5 * gene_length), 2000),
    # Genes not on "+" are handled as minus-strand genes. The upstream
    # extension of plus-strand genes is clamped to position 1 so that genes
    # close to the beginning of a sequence do not get coordinates below 1.
    # The upper bound of minus-strand genes cannot be clamped here because
    # sequence lengths are not available in this table.
    flank_start = if_else(
      strand == "+", pmax(start - 2000, 1), end - inner_flank_size
    ),
    flank_end = if_else(
      strand == "+", start + inner_flank_size, end + 2000
    )
  )

# One range per gene, in the same order as `gr_gene`.
gr_geneStartRegion <- GRanges(
  seqnames = df_gene$seqnames,
  ranges = IRanges(start = df_gene$flank_start, end = df_gene$flank_end),
  strand = df_gene$strand,
  gene_id = df_gene$gene_id
)

################################################################################

# Assemble the set of gene IDs that are excluded from all analyses below.

# (1) Gene models that look like mis-annotated TEs: overlap with a single TE,
#     or summed overlap with several TEs, exceeds its threshold.
#     NOTE(review): the two thresh* constants are not defined in this file;
#     presumably they come from modules/misannotation_thresholds.R -- confirm.
df_gene_trueness <- readr::read_csv(
  glue(Sys.getenv("HOME"), "/code/malawi_transposon/cloud/genome/{bb_name}/gene_trueness.csv")
)
tmp1 <- df_gene_trueness %>%
  filter(
    maxOverlapSingleTE > threshGeneBodyOverlapSingleTE |
      totalOverlapMultipleTE > threshGeneBodyOverlapMultipleTE
  ) %>%
  pull(gene_id)

# (2) Genes without an ortholog in the orthology table (unreliable annotations).
df_orthology <- readr::read_csv(
  glue(Sys.getenv("HOME"), "/code/malawi_transposon/script/R_SCRIPT/local/gene_ontology/geneListFilter_markedlyAbsent-{bb_name_abrv}.csv")
)
tmp2 <- df_orthology$gene[is.na(df_orthology$ortholog)]

# (3) Genes listed in the transposition gene list file (comma-separated IDs).
tmp3 <- unlist(strsplit(
  readLines("./local/gene_ontology_TE/biomart_transposition.txt"),
  ","
))

# Union of the three sources, and its size.
genes_to_ignore <- unique(c(tmp1, tmp2, tmp3))
length(genes_to_ignore)

# Drop the ignored genes from each of the three gene-level GRanges.
gr_geneFilter <- gr_gene[!(gr_gene$gene_id %in% genes_to_ignore)]
gr_geneStartRegionFilter <- gr_geneStartRegion[
  !(gr_geneStartRegion$gene_id %in% genes_to_ignore)
]
gr_geneFlankedFilter <- gr_geneFlanked[
  !(gr_geneFlanked$gene_id %in% genes_to_ignore)
]

################################################################################

# Read the table of TE-intersecting SV bubbles and convert it to GRanges.
df_intersect_filter <- read_csv(
  glue(Sys.getenv("HOME"), "/code/malawi_transposon/script/R_SCRIPT/local/malawi_haplochromines-te-intersects.csv")
)

# `bubble_id` is split on ":" and "-" into chr / start / end; the original
# column is kept as an identifier. select() is namespace-qualified (as in the
# gene table code earlier in this script) so the call cannot be captured by a
# same-named function from another attached package.
tmp_df_gr_sv <- df_intersect_filter %>%
  dplyr::select(bubble_id, repeat_class, repeat_subclass) %>%
  separate(
    bubble_id,
    into = c("chr", "start", "end"), sep = "[:-]", remove = FALSE
  ) %>%
  mutate(start = as.integer(start), end = as.integer(end)) %>%
  distinct()

# Fail with an explicit message when an ID did not parse to coordinates;
# otherwise the range construction below would fail on the NA values.
if (anyNA(tmp_df_gr_sv$start) || anyNA(tmp_df_gr_sv$end)) {
  stop("Some bubble_id values could not be parsed as 'chr:start-end'.",
       call. = FALSE)
}

# Unstranded ranges carrying the bubble ID and the repeat annotation.
gr_sv_polyTE <- GRanges(
  seqnames = tmp_df_gr_sv$chr,
  ranges = IRanges(start = tmp_df_gr_sv$start, end = tmp_df_gr_sv$end),
  bubble_id = tmp_df_gr_sv$bubble_id,
  repeat_class = tmp_df_gr_sv$repeat_class,
  repeat_subclass = tmp_df_gr_sv$repeat_subclass
)

################################################################################

# Intersect polymorphic TE bubbles with the (filtered) gene start regions.
hits <- findOverlaps(gr_sv_polyTE, gr_geneStartRegionFilter)

# Subject indices in `hits` refer to the object passed to findOverlaps(), so
# the gene IDs must be read from the filtered start regions, not from the
# unfiltered `gr_geneStartRegion`.
df_hits <- bind_cols(
  as.data.frame(gr_sv_polyTE)[queryHits(hits), ],
  tibble(gene = gr_geneStartRegionFilter$gene_id[subjectHits(hits)])
) %>%
  distinct()

# number of unique genes with polymorphic TE insertion
length(unique(df_hits$gene))

# gene ontology
# Optional restriction to a single repeat type:
#tmp_gene <- df_hits %>% filter(repeat_subclass == 'DNA/TcMar') %>% .$gene %>% unique()
#tmp_gene <- df_hits %>% filter(repeat_class == 'DNA/TcMar') %>% .$gene %>% unique()
tmp_gene <- unique(df_hits$gene)
go_result <- run_GO_analysis(tmp_gene, "acalliptera", 0.1, FALSE)

# Only open the table viewer in an interactive session.
if (interactive()) {
  View(go_result$result)
}

writeLines(tmp_gene, "./local/gene_ontology_TE/geneList_polymorphic-TE_StartRegion.txt")

################################################################################

# Intersect polymorphic TE bubbles with the (filtered) flanked gene regions,
# i.e. gene body plus 2000 bp upstream.
hits <- findOverlaps(gr_sv_polyTE, gr_geneFlankedFilter)

# Subject indices in `hits` refer to `gr_geneFlankedFilter`, so the gene IDs
# must be read from that same object.
df_hits <- bind_cols(
  as.data.frame(gr_sv_polyTE)[queryHits(hits), ],
  tibble(gene = gr_geneFlankedFilter$gene_id[subjectHits(hits)])
) %>%
  distinct()

# number of unique genes with polymorphic TE insertion
length(unique(df_hits$gene))

# gene ontology
# Optional restriction to a single repeat type:
#tmp_gene <- df_hits %>% filter(repeat_subclass == 'DNA/TcMar') %>% .$gene %>% unique()
#tmp_gene <- df_hits %>% filter(repeat_class == 'DNA/TcMar') %>% .$gene %>% unique()
tmp_gene <- unique(df_hits$gene)
go_result <- run_GO_analysis(tmp_gene, "acalliptera", 0.05, FALSE)

# Only open the table viewer in an interactive session.
if (interactive()) {
  View(go_result$result)
}

writeLines(tmp_gene, "./local/gene_ontology_TE/geneList_polymorphic-TE_Flanked.txt")

################################################################################
################################################################################
# IMPORT ALL THE SV COORDINATES

# Sequence names and lengths of the assembly, read from its chrom.sizes file
# and stored as a Seqinfo object.
chrom_sizes <- local({
  size_table <- readr::read_delim(
    glue(Sys.getenv("HOME"), "/igv/genomes/{bb_name}/chrom.sizes"),
    col_names = c("chr", "len")
  )
  Seqinfo(size_table$chr, size_table$len)
})
# Summed length of all sequences.
bb_size <- sum(seqlengths(chrom_sizes))

# SV summary table: 20 fixed columns followed by one 6-column group per genome.
# NOTE(review): `num_of_genomes` is not defined in this file; presumably it is
# set by one of the sourced modules -- confirm.
df_sv <- read_csv(
  "./local/malawi_haplochromines-summary-preprocessed.csv",
  col_types = paste0("cciiiiicciiicilliidc", strrep("ciicic", num_of_genomes)),
  na = c("", "NA", ".")
) %>%
  filter(
    n_paths_bio <= 3,
    lenbio_min < 10,
    lenbio_minmaxratio <= 0.1
  )

# The retained SVs as unstranded ranges that carry the SV id and the assembly's
# sequence information.
gr_sv <- GRanges(
  seqnames = df_sv$chr,
  ranges = IRanges(start = df_sv$start, end = df_sv$end),
  id = df_sv$id,
  seqinfo = chrom_sizes
)

# get differentially expressed gene list (one gene ID per line)
list_of_de_genes <- readLines("./local/gene_ontology/diffExp-testes.txt")

################################################################################

library(VennDiagram)
library(RColorBrewer)

# INTERSECTION: GENE BODY
# (filtered genes, each extended by its 2000 bp upstream flank)

# Set A: genes overlapping any SV, strand ignored.
idxA <- findOverlaps(gr_geneFlankedFilter, gr_sv, ignore.strand = TRUE)
setA <- unique(gr_geneFlankedFilter$gene_id[queryHits(idxA)])

# Set B: genes overlapping a polymorphic TE site, strand ignored.
idxB <- findOverlaps(gr_geneFlankedFilter, gr_sv_polyTE, ignore.strand = TRUE)
setB <- unique(gr_geneFlankedFilter$gene_id[queryHits(idxB)])

# Raise the VennDiagram logger threshold to ERROR before plotting.
futile.logger::flog.threshold(
  futile.logger::ERROR,
  name = "VennDiagramLogger"
)

# Three-way Venn diagram: SV genes, polymorphic-TE genes, DE genes.
grid.newpage()
venn_object <- venn.diagram(
  x = list(setA, setB, list_of_de_genes),
  category.names = c("with SVs", "with polymorphic TEs", "diffExp"),
  total.population = length(gr_gene),
  output = FALSE,
  filename = NULL,
  fill = brewer.pal(3, "Pastel2")
)
grid.draw(venn_object)

# Differentially expressed genes among the filtered genes, split by whether
# the flanked gene region overlaps an SV.
goList_DiffExp_noSVnearby <- local({
  gene_ids <- gr_geneFlankedFilter$gene_id
  gene_ids[gene_ids %in% list_of_de_genes & !(gene_ids %in% setA)]
})
goList_DiffExp_withSVnearby <- setA[setA %in% list_of_de_genes]


# INTERSECTION: UPSTREAM REGION
# (filtered gene start regions)

# Set A: genes whose start region overlaps any SV, strand ignored.
idxA <- findOverlaps(gr_geneStartRegionFilter, gr_sv, ignore.strand = TRUE)
setA <- unique(gr_geneStartRegionFilter$gene_id[queryHits(idxA)])

# Set B: genes whose start region overlaps a polymorphic TE site,
# strand ignored.
idxB <- findOverlaps(
  gr_geneStartRegionFilter, gr_sv_polyTE,
  ignore.strand = TRUE
)
setB <- unique(gr_geneStartRegionFilter$gene_id[queryHits(idxB)])

# Raise the VennDiagram logger threshold to ERROR before plotting.
futile.logger::flog.threshold(
  futile.logger::ERROR,
  name = "VennDiagramLogger"
)

# Three-way Venn diagram: SV genes, polymorphic-TE genes, DE genes.
grid.newpage()
venn_object <- venn.diagram(
  x = list(setA, setB, list_of_de_genes),
  category.names = c("with SVs", "with polymorphic TEs", "diffExp"),
  total.population = length(gr_gene),
  output = FALSE,
  filename = NULL,
  fill = brewer.pal(3, "Pastel2")
)
grid.draw(venn_object)

# Differentially expressed genes whose start region overlaps an SV, and those
# whose start region overlaps a polymorphic TE site.
goList_DiffExp_withSVupstream <- setA[setA %in% list_of_de_genes]
goList_DiffExp_withPolyTEupstream <- setB[setB %in% list_of_de_genes]

####

# GO enrichment table (PDF) for each differentially expressed gene list.
# For every list: report its size, run the enrichment, blank the `query`
# column, collect the IDs of the terms flagged as highlighted and publish
# the table with those terms.

# DE genes without an SV overlapping the flanked gene region
length(goList_DiffExp_noSVnearby)
go_result <- run_GO_analysis(goList_DiffExp_noSVnearby, "acalliptera")
go_result$result[["query"]] <- ""
# replace the name of this one term in the published table
go_result$result$term_name[
  go_result$result$term_id == "GO:0042283"
] <- "dolichyl pyrophosphate alpha 1,3-glucosyltransferase"
highlighted_terms <- go_result$result %>%
  filter(highlighted == TRUE) %>%
  pull(term_id)
publish_gosttable(
  go_result,
  highlight_terms = highlighted_terms,
  use_colors = TRUE,
  filename = "./local/gene_ontology_recode/goList_DiffExp_noSVnearby.pdf"
)

# DE genes with an SV overlapping the flanked gene region
length(goList_DiffExp_withSVnearby)
go_result <- run_GO_analysis(goList_DiffExp_withSVnearby, "acalliptera")
go_result$result[["query"]] <- ""
highlighted_terms <- go_result$result %>%
  filter(highlighted == TRUE) %>%
  pull(term_id)
publish_gosttable(
  go_result,
  highlight_terms = highlighted_terms,
  use_colors = TRUE,
  filename = "./local/gene_ontology_recode/goList_DiffExp_withSVnearby.pdf"
)

# DE genes with an SV overlapping the gene start region
length(goList_DiffExp_withSVupstream)
go_result <- run_GO_analysis(goList_DiffExp_withSVupstream, "acalliptera")
go_result$result[["query"]] <- ""
highlighted_terms <- go_result$result %>%
  filter(highlighted == TRUE) %>%
  pull(term_id)
publish_gosttable(
  go_result,
  highlight_terms = highlighted_terms,
  use_colors = TRUE,
  filename = "./local/gene_ontology_recode/goList_DiffExp_withSVupstream.pdf"
)

# DE genes with a polymorphic TE site overlapping the gene start region
length(goList_DiffExp_withPolyTEupstream)
go_result <- run_GO_analysis(goList_DiffExp_withPolyTEupstream, "acalliptera")
go_result$result[["query"]] <- ""
highlighted_terms <- go_result$result %>%
  filter(highlighted == TRUE) %>%
  pull(term_id)
publish_gosttable(
  go_result,
  highlight_terms = highlighted_terms,
  use_colors = TRUE,
  filename = "./local/gene_ontology_recode/goList_DiffExp_withPolyTEupstream.pdf"
)


