library(AnnotationDbi)
library(GenomicRanges)
library(ggplot2)
library(tibble)
library(dplyr)
library(tidyr)
library(readr)
library(glue)

# Run from the project's R script directory: the module paths sourced below and
# the "./local/" output path at the end of the script are relative to it.
r_script_dir <- paste0(Sys.getenv("HOME"), "/code/malawi_transposon/script/R_SCRIPT/")
setwd(r_script_dir)

# Load the project modules into the global environment.
# NOTE(review): these presumably define `chrom_sizes_list`,
# `color_schema_species` and the `thresh*` cut-offs used further down -- their
# contents are not visible from this script.
for (module_path in c("modules/get_chrom_sizes.R",
                      "modules/misannotation_thresholds.R")) {
  source(module_path)
}

################################################################################
# BACKBONE SELECTION
#
# Set `backbone_choice` to the backbone the script should run on. All
# backbone-specific names are looked up from that single value, so the settings
# of two backbones can never be mixed by a partial edit.
#   "astCal" -> astCal1.2 backbone
#   "mayZeb" -> mayZeb2.0 backbone

backbone_choice <- "mayZeb"

# subdir         : sub-directory of the graph genome output holding the
#                  coverage table
# bb_name        : genome directory name under ~/igv/genomes
# bb_name_sqlite : base name of the annotation sqlite file
backbone_config <- switch(
  backbone_choice,
  astCal = list(
    subdir = "malawi_haplochromines",
    bb_name = "astCal1.2_ensembl",
    bb_name_sqlite = "astCal_v1.2"
  ),
  mayZeb = list(
    subdir = "malawi_haplochromines_zebra",
    bb_name = "mayZeb2.0_ensembl",
    bb_name_sqlite = "mayZeb_v2.0"
  ),
  # Unnamed last argument is the fallback: refuse unknown backbones outright.
  stop(
    "Unknown backbone_choice '", backbone_choice,
    "'; expected \"astCal\" or \"mayZeb\".",
    call. = FALSE
  )
)

subdir <- backbone_config$subdir
bb_name <- backbone_config$bb_name
bb_name_sqlite <- backbone_config$bb_name_sqlite
# The abbreviation doubles as the key into `chrom_sizes_list` and as the suffix
# of the output file name.
bb_name_abrv <- backbone_choice

################################################################################

# Load the per-segment coverage table of the selected backbone.
# Columns used later in this script: `segment`, `chr`, `start`, `end`, and the
# columns named after the species abbreviations (filtered on per species).
# `col_types` pins the column layout: character, character, integer, integer,
# logical, followed by double columns.
path_to_coverage_csv <- glue(
  Sys.getenv("HOME"),
  "/code/malawi_transposon/cloud/graph_genome/{subdir}/pseudoreference/pav/graph_bbsegments_coverage_by_species.csv"
)
df_coverage_by_species <- read_csv(
  file = path_to_coverage_csv,
  col_types = "cciildddddddd"
)

# Open the annotation database of the selected backbone and extract the gene
# ranges, keeping the gene ID and transcript names as metadata columns.
path_to_annotation_sqlite <- glue(
  Sys.getenv("HOME"),
  "/igv/genomes/{bb_name}/{bb_name_sqlite}.gff.sqlite"
)
txdb <- path_to_annotation_sqlite %>%
  AnnotationDbi::loadDb()
gr_gene <- txdb %>%
  genes(columns = c("gene_id", "tx_name"))

# Pad every gene with flanking sequence: the range is widened by 50 twice, once
# with the start held fixed and once with the end held fixed, so each gene
# gains 50 bp on both sides.
gr_gene_flanked <- resize(gr_gene, width = width(gr_gene) + 50, fix = "start")
gr_gene_flanked <- resize(
  gr_gene_flanked,
  width = width(gr_gene_flanked) + 50,
  fix = "end"
)
# Clip any padded range that now runs past the bounds of its sequence.
gr_gene_flanked <- trim(gr_gene_flanked)


# For each species, list the genes whose flanked gene body is sufficiently
# covered by the backbone segments that this species covers well.
# Result: `df_out`, a long tibble with one row per (sample, gene) pair.

species_names <- as.character(color_schema_species$species_abrv)

# Every species needs a coverage column of the same name; fail up front with a
# readable message rather than part-way through the loop.
missing_species <- setdiff(species_names, colnames(df_coverage_by_species))
if (length(missing_species) > 0) {
  stop(
    "No coverage column found for species: ",
    paste(missing_species, collapse = ", "),
    call. = FALSE
  )
}

# The sequence info of the backbone is the same for every species, so build it
# once. The first element of the chromosome-size entry is passed as the
# sequence names and the second as the sequence lengths.
chrom_sizes <- chrom_sizes_list[[bb_name_abrv]]
seqinfo_backbone <- Seqinfo(chrom_sizes[[1]], chrom_sizes[[2]])

# Collect one tibble per species and bind them once after the loop instead of
# re-copying a growing data frame on every iteration.
genes_by_sample <- vector("list", length(species_names))

for (i in seq_along(species_names)) {
  sample_name <- species_names[[i]]

  # Segments where the current species reaches the minimum coverage.
  # `.data[[...]]` looks the name up strictly among the table's columns.
  tmp_df <- df_coverage_by_species %>%
    filter(.data[[sample_name]] >= threshMinCoverageAcrossSpecies)
  gr_covered <- GenomicRanges::GRanges(
    seqnames = tmp_df$chr,
    ranges = IRanges(start = tmp_df$start, end = tmp_df$end),
    segment = tmp_df$segment,
    seqinfo = seqinfo_backbone
  ) %>%
    GenomicRanges::trim()

  # Pair every flanked gene with each covered segment it overlaps (strand is
  # ignored) and take the intersection of each pair.
  hits <- findOverlaps(gr_gene_flanked, gr_covered, ignore.strand = TRUE)
  gr_gene_hit <- gr_gene_flanked[queryHits(hits)]
  overlaps <- pintersect(
    gr_gene_hit,
    gr_covered[subjectHits(hits)],
    ignore.strand = TRUE
  )

  # Fraction of each flanked gene covered, summed over all segments it hits.
  # NOTE(review): the sum assumes the covered segments do not overlap one
  # another; overlapping segments would be counted twice -- confirm.
  df_percentGeneOverlap_coverage <- tibble(
    gene = gr_gene_hit$gene_id,
    percentGeneOverlap = width(overlaps) / width(gr_gene_hit)
  ) %>%
    group_by(gene) %>%
    summarise(percentGeneOverlap = sum(percentGeneOverlap))

  # Keep genes whose covered fraction exceeds the threshold. The summary has
  # one row per gene, so the IDs are already unique.
  geneList_goodCoverage <- df_percentGeneOverlap_coverage %>%
    filter(percentGeneOverlap > threshGeneBodyOverlapHighCovSegments) %>%
    pull(gene)

  # Progress report; the percentage is relative to all annotated genes.
  print(glue(
    length(geneList_goodCoverage), " ({percent_covered}%) of genes are covered in {sample_name}",
    percent_covered = round(length(geneList_goodCoverage) / length(gr_gene) * 100, 2))
  )

  genes_by_sample[[i]] <- tibble(
    sample = sample_name,
    gene = geneList_goodCoverage
  )
}

df_out <- bind_rows(genes_by_sample)

# Quick overview: for each possible number of samples, how many genes are well
# covered in exactly that many samples. Genes in complex regions and genes that
# are TEs have not been removed yet at this point.
df_out %>%
  count(gene, name = "counted") %>%
  pull(counted) %>%
  table()

# Save the (sample, gene) table, one file per backbone. The path is relative to
# the working directory set at the top of the script.
output_csv <- glue("./local/segment_coverage_by_species-{bb_name_abrv}.csv")
write_csv(df_out, output_csv)
