library(AnnotationDbi)
library(GenomicRanges)
library(tibble)
library(dplyr)
library(tidyr)
library(readr)
library(glue)

# Every relative "./local/..." path used below is resolved against this
# directory under the user's HOME.
setwd(
  sprintf("%s/code/malawi_transposon/script/R_SCRIPT/", Sys.getenv("HOME"))
)


################################################################################
################################################################################
# ASTCAL

# Private-gene list for the astCal backbone (its `gene` column is used below).
df_private_astCal <- readr::read_csv(
  file = "./local/gene_ontology/geneList_markedlyAbsent-astCal.csv"
)

# Every annotated gene of the backbone: load the annotation database, extract
# the gene ranges and flatten them into a tibble.
txdb_astCal <- AnnotationDbi::loadDb(
  file = glue(
    Sys.getenv("HOME"),
    "/igv/genomes/astCal1.2_ensembl/astCal_v1.2.gff.sqlite"
  )
)
df_all_genes_astCal <- as_tibble(as.data.frame(genes(txdb_astCal)))

# Gene / transcript / peptide id table for the backbone.
df_all_peptide_astCal <- readr::read_csv(
  file = "./local/tblastn/gene_transcript_peptide-AC.csv"
)

# tblastn hits for the AC queries against MZ. Column names are supplied
# explicitly (so the first row is read as data) and any text after a '#' is
# ignored -- presumably BLAST tabular output with comment lines.
df_tblastn_astCal <- readr::read_tsv(
  file = "./local/tblastn/tblastn_ACquery_vs_MZ.tsv",
  comment = "#",
  col_names = c(
    "query", "subject_chr", "perc_identity", "align_len", "mismatch", "gaps",
    "q_start", "q_end", "s_start", "s_end", "evalue", "bit_score"
  )
)
# Strip a trailing ".1" from the query ids (presumably a version suffix) so
# they can be matched against the peptide ids later on.
df_tblastn_astCal$query <- sub("\\.1$", "", df_tblastn_astCal$query)


# sanity check (disabled): every private gene should appear in the peptide table
#all(df_private_astCal$gene %in% df_all_peptide_astCal$`Gene stable ID`)


# Build the working table for the astCal private genes.
#
# Step 1: one row per distinct (peptide, gene) pair of the private genes,
# annotated with the sequence name (chromosome / scaffold) of the gene.
# Join keys are spelled out so the join does not depend on whichever column
# names the two tables happen to share.
tmp <- left_join(
  df_all_peptide_astCal %>%
    filter(`Gene stable ID` %in% df_private_astCal$gene) %>%
    select(peptide_id = `Protein stable ID`, gene_id = `Gene stable ID`) %>%
    distinct(),
  df_all_genes_astCal %>% select(gene_id, gene_chr = seqnames),
  by = "gene_id"
)
# seqnames may be a factor after as.data.frame(); the startsWith() and `==`
# comparisons further down need plain character strings.
tmp$gene_chr <- as.character(tmp$gene_chr)

# Step 2: attach the tblastn hits of each peptide (a peptide with several hits
# yields several rows; a peptide without hits keeps one row with NA hit
# columns).
tmp <- left_join(
  tmp,
  df_tblastn_astCal %>%
    select(peptide_id = query, subject_chr, perc_identity, evalue),
  by = "peptide_id"
) %>%
  # Remove the "LG" tag from the subject sequence names -- presumably so they
  # are directly comparable with gene_chr in the same-chromosome filter.
  mutate(subject_chr_parsed = gsub("LG", "", subject_chr))

# Number of distinct private genes in the working table.
glue("number of private genes: {n_distinct(tmp$gene_id)}")

# Same count, restricted to genes whose sequence name does not start with "LS"
# (i.e. not on a scaffold).
glue(
  "number of private genes not on scaffolds: {x}",
  x = tmp %>%
    filter(!startsWith(gene_chr, "LS")) %>%
    pull(gene_id) %>%
    n_distinct()
)

# Not on a scaffold, with a hit of more than 80% identity on the exact same
# chromosome.
tmp %>%
  filter(
    !startsWith(gene_chr, "LS"),
    gene_chr == subject_chr_parsed,
    perc_identity > 80
  ) %>%
  pull(gene_id) %>%
  n_distinct()

# Not on a scaffold, with a hit of more than 80% identity anywhere (no
# constraint on which chromosome the hit falls on).
tmp %>%
  filter(!startsWith(gene_chr, "LS"), perc_identity > 80) %>%
  pull(gene_id) %>%
  n_distinct()


################################################################################
################################################################################
# MAYZEB

# Private-gene list for the mayZeb backbone (its `gene` column is used below).
df_private_mayZeb <- readr::read_csv(
  file = "./local/gene_ontology/geneList_markedlyAbsent-mayZeb.csv"
)

# Every annotated gene of the backbone: load the annotation database, extract
# the gene ranges and flatten them into a tibble.
txdb_mayZeb <- AnnotationDbi::loadDb(
  file = glue(
    Sys.getenv("HOME"),
    "/igv/genomes/mayZeb2.0_ensembl/mayZeb_v2.0.gff.sqlite"
  )
)
df_all_genes_mayZeb <- as_tibble(as.data.frame(genes(txdb_mayZeb)))

# Gene / transcript / peptide id table for the backbone.
df_all_peptide_mayZeb <- readr::read_csv(
  file = "./local/tblastn/gene_transcript_peptide-MZ.csv"
)

# tblastn hits for the MZ queries against AC. Column names are supplied
# explicitly (so the first row is read as data) and any text after a '#' is
# ignored -- presumably BLAST tabular output with comment lines.
df_tblastn_mayZeb <- readr::read_tsv(
  file = "./local/tblastn/tblastn_MZquery_vs_AC.tsv",
  comment = "#",
  col_names = c(
    "query", "subject_chr", "perc_identity", "align_len", "mismatch", "gaps",
    "q_start", "q_end", "s_start", "s_end", "evalue", "bit_score"
  )
)
# Strip a trailing ".1" from the query ids (presumably a version suffix) so
# they can be matched against the peptide ids later on.
df_tblastn_mayZeb$query <- sub("\\.1$", "", df_tblastn_mayZeb$query)


# sanity check (disabled): every private gene should appear in the peptide table
#all(df_private_mayZeb$gene %in% df_all_peptide_mayZeb$`Gene stable ID`)


# Build the working table for the mayZeb private genes.
#
# Step 1: one row per distinct (peptide, gene) pair of the private genes,
# annotated with the sequence name (chromosome / scaffold) of the gene.
# Join keys are spelled out so the join does not depend on whichever column
# names the two tables happen to share.
tmp <- left_join(
  df_all_peptide_mayZeb %>%
    filter(`Gene stable ID` %in% df_private_mayZeb$gene) %>%
    select(peptide_id = `Protein stable ID`, gene_id = `Gene stable ID`) %>%
    distinct(),
  df_all_genes_mayZeb %>% select(gene_id, gene_chr = seqnames),
  by = "gene_id"
)
# seqnames may be a factor after as.data.frame(); the startsWith() and `==`
# comparisons further down need plain character strings.
tmp$gene_chr <- as.character(tmp$gene_chr)

# Step 2: attach the tblastn hits of each peptide (a peptide with several hits
# yields several rows; a peptide without hits keeps one row with NA hit
# columns).
tmp <- left_join(
  tmp,
  df_tblastn_mayZeb %>%
    select(peptide_id = query, subject_chr, perc_identity, evalue),
  by = "peptide_id"
) %>%
  # Prefix the subject sequence names with "LG" -- presumably so they are
  # directly comparable with gene_chr in the same-chromosome filter. paste0()
  # keeps the column a plain character vector.
  mutate(subject_chr_parsed = paste0("LG", subject_chr))

# Number of distinct private genes in the working table.
glue("number of private genes: {n_distinct(tmp$gene_id)}")

# Same count, restricted to genes whose sequence name starts with "LG"
# (i.e. placed on a linkage group rather than on a scaffold).
glue(
  "number of private genes not on scaffolds: {x}",
  x = tmp %>%
    filter(startsWith(gene_chr, "LG")) %>%
    pull(gene_id) %>%
    n_distinct()
)

# Not on a scaffold, with a hit of more than 80% identity on the exact same
# chromosome.
tmp %>%
  filter(
    startsWith(gene_chr, "LG"),
    gene_chr == subject_chr_parsed,
    perc_identity > 80
  ) %>%
  pull(gene_id) %>%
  n_distinct()

# Not on a scaffold, with a hit of more than 80% identity anywhere (no
# constraint on which chromosome the hit falls on).
tmp %>%
  filter(startsWith(gene_chr, "LG"), perc_identity > 80) %>%
  pull(gene_id) %>%
  n_distinct()

