library(ggplot2)
library(tibble)
library(dplyr)
library(tidyr)
library(gprofiler2)

# Work from the project's R_SCRIPT directory: every relative path used below
# ("modules/...", "./local/...", "../../metadata/...") is resolved against it.
setwd(paste0(Sys.getenv("HOME"), "/code/malawi_transposon/script/R_SCRIPT/"))
# Project helpers. run_GO_analysis() and publish_gostplot_custom() are called
# further down but not defined in this file — presumably they come from here.
source("modules/custom_gene_ontology.R")

################################################################################
################################################################################

# astCal private set
#
# Annotate every gene of the astCal "markedly absent" list with its own
# coordinates and, where the orthology table lists one, its Zebra mbuna
# ortholog and that ortholog's coordinates. Rows whose `ortholog` is NA are
# treated further down as truly private genes, the rest as artifacts.
df_private_astCal <- readr::read_csv(
  "./local/gene_ontology/geneList_markedlyAbsent-astCal.csv"
)
df_orthology_astCal <- readr::read_delim(
  "../../metadata/ortholog_calliptera_to_mbuna.txt"
)

df_compared_astCal <- left_join(
  # One key row per gene: the full private table is joined back on at the end,
  # so duplicated genes here would multiply the output rows.
  distinct(df_private_astCal, gene),
  df_orthology_astCal %>%
    select(
      gene = `Gene stable ID`,
      ortholog = `Zebra mbuna gene stable ID`,
      astCal_scaffold = `Chromosome/scaffold name`,
      astCal_start = `Gene start (bp)`,
      astCal_end = `Gene end (bp)`,
      mayZeb_scaffold = `Zebra mbuna chromosome/scaffold name`,
      mayZeb_start = `Zebra mbuna chromosome/scaffold start (bp)`,
      mayZeb_end = `Zebra mbuna chromosome/scaffold end (bp)`
    ),
  by = "gene"
) %>%
  # Build "scaffold:start-end" strings column-wise (no rowwise() needed).
  # A coordinate is NA when any of its parts is missing, and the ortholog
  # coordinate is additionally NA when there is no ortholog.
  # NOTE(review): numeric start/end values are stringified with R's default
  # formatting, so a round double such as 100000 renders as "1e+05" — confirm
  # this cannot occur in the data or format the columns explicitly.
  mutate(
    astCal_coord = if_else(
      is.na(astCal_scaffold) | is.na(astCal_start) | is.na(astCal_end),
      NA_character_,
      paste0(astCal_scaffold, ":", astCal_start, "-", astCal_end)
    ),
    mayZeb_coord = if_else(
      is.na(ortholog) | is.na(mayZeb_scaffold) |
        is.na(mayZeb_start) | is.na(mayZeb_end),
      NA_character_,
      paste0(mayZeb_scaffold, ":", mayZeb_start, "-", mayZeb_end)
    )
  ) %>%
  select(gene, astCal_coord, ortholog, mayZeb_coord) %>%
  # Re-attach the remaining columns of the private list (e.g. gene_description).
  # Natural join on the shared column(s) — presumably just `gene`; verify
  # against the CSV header.
  left_join(df_private_astCal)

# Persist the annotated list; NA orthologs (private genes) sort last.
df_compared_astCal %>%
  arrange(ortholog, gene) %>%
  readr::write_csv(
    paste0(Sys.getenv("HOME"), "/code/malawi_transposon/storage/local/script/gene_ontology/geneListFilter_markedlyAbsent-astCal.csv")
  )

# Private genes (no ortholog): number of distinct genes, then number of
# distinct (gene, description) pairs that actually carry a description.
df_compared_astCal %>%
  filter(is.na(ortholog)) %>%
  pull(gene) %>%
  n_distinct()
df_compared_astCal %>%
  filter(is.na(ortholog), !is.na(gene_description)) %>%
  select(gene, gene_description) %>%
  unique() %>%
  nrow()

# Artifact genes (ortholog present): the same two counts.
df_compared_astCal %>%
  filter(!is.na(ortholog)) %>%
  pull(gene) %>%
  n_distinct()
df_compared_astCal %>%
  filter(!is.na(ortholog), !is.na(gene_description)) %>%
  select(gene, gene_description) %>%
  unique() %>%
  nrow()


################################################################################
################################################################################
# mayZeb private set
#
# Annotate every gene of the mayZeb "markedly absent" list with its own
# coordinates and, where the orthology table lists one, its Eastern happy
# ortholog and that ortholog's coordinates. Rows whose `ortholog` is NA are
# treated further down as truly private genes, the rest as artifacts.
df_private_mayZeb <- readr::read_csv(
  "./local/gene_ontology/geneList_markedlyAbsent-mayZeb.csv"
)
df_orthology_mayZeb <- readr::read_delim(
  "../../metadata/ortholog_mbuna_to_calliptera.txt"
)

df_compared_mayZeb <- left_join(
  # One key row per gene: the full private table is joined back on at the end,
  # so duplicated genes here would multiply the output rows.
  distinct(df_private_mayZeb, gene),
  df_orthology_mayZeb %>%
    select(
      gene = `Gene stable ID`,
      ortholog = `Eastern happy gene stable ID`,
      mayZeb_scaffold = `Chromosome/scaffold name`,
      mayZeb_start = `Gene start (bp)`,
      mayZeb_end = `Gene end (bp)`,
      astCal_scaffold = `Eastern happy chromosome/scaffold name`,
      astCal_start = `Eastern happy chromosome/scaffold start (bp)`,
      astCal_end = `Eastern happy chromosome/scaffold end (bp)`
    ),
  by = "gene"
) %>%
  # Build "scaffold:start-end" strings column-wise (no rowwise() needed).
  # A coordinate is NA when any of its parts is missing, and the ortholog
  # coordinate is additionally NA when there is no ortholog.
  # NOTE(review): numeric start/end values are stringified with R's default
  # formatting, so a round double such as 100000 renders as "1e+05" — confirm
  # this cannot occur in the data or format the columns explicitly.
  mutate(
    mayZeb_coord = if_else(
      is.na(mayZeb_scaffold) | is.na(mayZeb_start) | is.na(mayZeb_end),
      NA_character_,
      paste0(mayZeb_scaffold, ":", mayZeb_start, "-", mayZeb_end)
    ),
    astCal_coord = if_else(
      is.na(ortholog) | is.na(astCal_scaffold) |
        is.na(astCal_start) | is.na(astCal_end),
      NA_character_,
      paste0(astCal_scaffold, ":", astCal_start, "-", astCal_end)
    )
  ) %>%
  select(gene, mayZeb_coord, ortholog, astCal_coord) %>%
  # Re-attach the remaining columns of the private list (e.g. gene_description).
  # Natural join on the shared column(s) — presumably just `gene`; verify
  # against the CSV header.
  left_join(df_private_mayZeb)

# Persist the annotated list; NA orthologs (private genes) sort last.
df_compared_mayZeb %>%
  arrange(ortholog, gene) %>%
  readr::write_csv(
    paste0(Sys.getenv("HOME"), "/code/malawi_transposon/storage/local/script/gene_ontology/geneListFilter_markedlyAbsent-mayZeb.csv")
  )

# Private genes (no ortholog): number of distinct genes, then number of
# distinct (gene, description) pairs that actually carry a description.
df_compared_mayZeb %>%
  filter(is.na(ortholog)) %>%
  pull(gene) %>%
  n_distinct()
df_compared_mayZeb %>%
  filter(is.na(ortholog), !is.na(gene_description)) %>%
  select(gene, gene_description) %>%
  unique() %>%
  nrow()

# Artifact genes (ortholog present): the same two counts.
df_compared_mayZeb %>%
  filter(!is.na(ortholog)) %>%
  pull(gene) %>%
  n_distinct()
df_compared_mayZeb %>%
  filter(!is.na(ortholog), !is.na(gene_description)) %>%
  select(gene, gene_description) %>%
  unique() %>%
  nrow()

################################################################################
################################################################################
# Compare how well the two "universal" gene lists agree with each other.
# Each list is checked against the ortholog-ID column of the *other* species'
# orthology table, and the share of IDs found there is reported.
# glue is called through its namespace because this file does not attach it.

# astCal universal IDs that appear as an Eastern happy ortholog in the
# mbuna -> calliptera table.
universal_astCal_geneList <- readLines("./local/gene_ontology/geneList_Universal-astCal.txt")
glue::glue(
  "{has_match} matched out of {list_length} ({round(has_match/list_length * 100, 2)}%)",
  has_match = sum(universal_astCal_geneList %in% df_orthology_mayZeb$`Eastern happy gene stable ID`),
  list_length = length(universal_astCal_geneList)
)

# mayZeb universal IDs that appear as a Zebra mbuna ortholog in the
# calliptera -> mbuna table.
universal_mayZeb_geneList <- readLines("./local/gene_ontology/geneList_Universal-mayZeb.txt")
glue::glue(
  "{has_match} matched out of {list_length} ({round(has_match/list_length * 100, 2)}%)",
  has_match = sum(universal_mayZeb_geneList %in% df_orthology_astCal$`Zebra mbuna gene stable ID`),
  list_length = length(universal_mayZeb_geneList)
)

################################################################################
################################################################################
# GENE ONTOLOGY ANALYSIS

# GO enrichment for the truly private astCal genes (no mbuna ortholog).
gene_list <- df_compared_astCal %>%
  filter(is.na(ortholog)) %>%
  pull(gene) %>%
  unique()
go_result <- run_GO_analysis(gene_list, "acalliptera")
# Overwrite the query column with an empty string (presumably to drop the
# query label from the figure — confirm).
go_result$result$query <- ""
highlighted_terms <- go_result$result %>%
  filter(highlighted == TRUE) %>%
  pull(term_id)

# Static plot with the highlighted terms, then the matching results table.
p <- gostplot(go_result, capped = TRUE, interactive = FALSE)
pp <- publish_gostplot_custom(p, highlight_terms = highlighted_terms)
ggsave(
  filename = "./local/gene_ontology_recode/gProfiler_geneListFilter_markedlyAbsent-astCal.pdf",
  plot = pp,
  width = 9,
  height = 2.8
)
publish_gosttable(
  go_result,
  highlight_terms = highlighted_terms,
  use_colors = TRUE,
  filename = "./local/gene_ontology_recode/gProfiler_geneListFilter_markedlyAbsent-astCal-table.pdf"
)


# GO enrichment for the astCal artifact genes (an mbuna ortholog exists).
gene_list <- df_compared_astCal %>%
  filter(!is.na(ortholog)) %>%
  pull(gene) %>%
  unique()
go_result <- run_GO_analysis(gene_list, "acalliptera")
# Overwrite the query column with an empty string (presumably to drop the
# query label from the figure — confirm).
go_result$result$query <- ""
highlighted_terms <- go_result$result %>%
  filter(highlighted == TRUE) %>%
  pull(term_id)

# Static plot with the highlighted terms, then the matching results table.
p <- gostplot(go_result, capped = TRUE, interactive = FALSE)
pp <- publish_gostplot_custom(p, highlight_terms = highlighted_terms)
ggsave(
  filename = "./local/gene_ontology_recode/gProfiler_geneListFilter_markedlyArtifact-astCal.pdf",
  plot = pp,
  width = 9,
  height = 2.8
)
publish_gosttable(
  go_result,
  highlight_terms = highlighted_terms,
  use_colors = TRUE,
  filename = "./local/gene_ontology_recode/gProfiler_geneListFilter_markedlyArtifact-astCal-table.pdf"
)


# GO enrichment for the truly private mayZeb genes (no calliptera ortholog).
gene_list <- df_compared_mayZeb %>%
  filter(is.na(ortholog)) %>%
  pull(gene) %>%
  unique()
go_result <- run_GO_analysis(gene_list, "mzebra")
# Overwrite the query column with an empty string (presumably to drop the
# query label from the figure — confirm).
go_result$result$query <- ""
highlighted_terms <- go_result$result %>%
  filter(highlighted == TRUE) %>%
  pull(term_id)

# Static plot with the highlighted terms, then the matching results table.
p <- gostplot(go_result, capped = TRUE, interactive = FALSE)
pp <- publish_gostplot_custom(p, highlight_terms = highlighted_terms)
ggsave(
  filename = "./local/gene_ontology_recode/gProfiler_geneListFilter_markedlyAbsent-mayZeb.pdf",
  plot = pp,
  width = 9,
  height = 2.8
)
publish_gosttable(
  go_result,
  highlight_terms = highlighted_terms,
  use_colors = TRUE,
  filename = "./local/gene_ontology_recode/gProfiler_geneListFilter_markedlyAbsent-mayZeb-table.pdf"
)


# GO enrichment for the mayZeb artifact genes (a calliptera ortholog exists).
gene_list <- df_compared_mayZeb %>%
  filter(!is.na(ortholog)) %>%
  pull(gene) %>%
  unique()
go_result <- run_GO_analysis(gene_list, "mzebra")
# Overwrite the query column with an empty string (presumably to drop the
# query label from the figure — confirm).
go_result$result$query <- ""
highlighted_terms <- go_result$result %>%
  filter(highlighted == TRUE) %>%
  pull(term_id)

# Static plot with the highlighted terms, then the matching results table.
p <- gostplot(go_result, capped = TRUE, interactive = FALSE)
pp <- publish_gostplot_custom(p, highlight_terms = highlighted_terms)
ggsave(
  filename = "./local/gene_ontology_recode/gProfiler_geneListFilter_markedlyArtifact-mayZeb.pdf",
  plot = pp,
  width = 9,
  height = 2.8
)
publish_gosttable(
  go_result,
  highlight_terms = highlighted_terms,
  use_colors = TRUE,
  filename = "./local/gene_ontology_recode/gProfiler_geneListFilter_markedlyArtifact-mayZeb-table.pdf"
)
