# This script converts the TxDb annotation objects for the calliptera and zebra
# genomes into BED files (written next to each TxDb), which are needed for
# downstream ODGI analyses.

library(GenomicFeatures)
library(AnnotationDbi)
library(dplyr)
library(readr)
library(glue)

# Contigs excluded from every BED output (the two mitochondrial contigs).
contigs_to_remove <- c("MT", "AGTA05001689.1")

# Flatten a GRanges object into a tibble with the six BED-style columns used
# throughout this script: chr, start, end, <name_col>, width, strand.
# Coordinates are still 1-based here; the shift to 0-based starts is done in
# write_bed(). dplyr verbs are namespace-qualified because `rename`/`select`
# also exist in S4Vectors/AnnotationDbi and depend on library() order.
granges_to_bed_tbl <- function(gr, name_col = "gene_id") {
  gr %>%
    as.data.frame() %>%
    as_tibble() %>%
    dplyr::rename(chr = seqnames) %>%
    dplyr::select(chr, start, end, all_of(name_col), width, strand)
}

# Write a tibble produced by granges_to_bed_tbl() as a headerless,
# tab-separated BED file.
#   df              tibble with at least chr, start and width columns
#   path            output file path
#   exclude_contigs contig names whose rows are dropped
#   drop_empty      if TRUE, also drop zero-width intervals (e.g. flanks that
#                   were trimmed away at a contig boundary)
# BED starts are 0-based, whereas R/GRanges starts are 1-based, hence the -1.
# `width` is taken from the GRanges and is therefore the true interval length.
write_bed <- function(df, path, exclude_contigs = character(0),
                      drop_empty = FALSE) {
  bed <- df %>%
    mutate(start = start - 1) %>%
    dplyr::filter(!chr %in% exclude_contigs)
  if (drop_empty) {
    bed <- dplyr::filter(bed, width > 0)
  }
  bed %>%
    arrange(chr, start) %>%
    write_delim(path, delim = "\t", col_names = FALSE)
}

for (species in c("astCal_v1.2", "mayZeb_v2.0")) {
  # Build explicit paths instead of calling setwd(): outputs land in the same
  # directory as before, but the session's working directory is left alone.
  genome_dir <- file.path(
    Sys.getenv("HOME"), "code", "malawi_transposon", "cloud", "genome",
    paste0(gsub("_v", "", species), "_ensembl")
  )
  txdb_path <- file.path(genome_dir, paste0(species, ".gff.sqlite"))
  if (!file.exists(txdb_path)) {
    stop("TxDb file not found: ", txdb_path, call. = FALSE)
  }
  txdb <- AnnotationDbi::loadDb(file = txdb_path)

  ##########
  # extract gene coordinates
  gr_gene <- genes(txdb, columns = c("gene_id", "tx_name"))
  df_gene <- granges_to_bed_tbl(gr_gene, name_col = "gene_id")

  # write gene BED file
  write_bed(df_gene, file.path(genome_dir, "txdb_gene.bed"), contigs_to_remove)

  ##########
  # extract exon coordinates (DEPRECATED: replaced by EXONIC PARTS BELOW)
  # ISSUE: different transcripts of the same gene
  # may have the same exon coordinates but different exon identifiers
  # thus, when grouped by gene, some exons are repeated multiple times
  gr_exon <- exons(txdb, columns = c("exon_name", "GENEID", "TXNAME"))
  df_exon <- granges_to_bed_tbl(gr_exon, name_col = "exon_name")

  # write exon BED file
  write_bed(df_exon, file.path(genome_dir, "txdb_exon.bed"), contigs_to_remove)

  # get gene to exon and exon length mapping (to calculate groupby PAV percent
  # later). The mapping is read straight off the unlisted per-gene exon ranges:
  # one row per (gene, exon), with the exon's own width. This avoids pasting
  # and re-splitting exon names and a join keyed on exon_name, which could
  # duplicate rows if an exon name were not unique.
  grl_exon_by_gene <- exonsBy(txdb, by = "gene")
  gr_exon_by_gene <- unlist(grl_exon_by_gene, use.names = FALSE)
  df_exon_to_gene <- tibble(
    gene = rep(
      names(grl_exon_by_gene),
      S4Vectors::elementNROWS(grl_exon_by_gene)
    ),
    exon = gr_exon_by_gene$exon_name,
    width = width(gr_exon_by_gene)
  )
  write_csv(
    df_exon_to_gene,
    file.path(genome_dir, "txdb_exon_to_gene_mapping.txt")
  )

  ##########
  # extract exonic parts (disjoint, non-overlapping exonic regions)
  gr_exonicParts <- exonicParts(txdb)
  df_exonic_parts <- granges_to_bed_tbl(gr_exonicParts, name_col = "gene_id")

  # an exonic part may belong to several genes: collapse its gene ids into a
  # single comma-separated string so it fits in one BED column
  df_exonic_parts$gene_id <- vapply(
    df_exonic_parts$gene_id,
    function(ids) paste(unlist(ids), collapse = ","),
    character(1),
    USE.NAMES = FALSE
  )

  # write exonic parts BED file
  write_bed(
    df_exonic_parts,
    file.path(genome_dir, "txdb_exonic_parts.bed"),
    contigs_to_remove
  )

  ##########
  # extract the 2000 bp immediately upstream of each gene (strand-aware).
  # NOTE(review): an earlier comment described this window as "-2000 to +500 of
  # gene start", but flank() returns only the 2000 bp outside the gene and does
  # not extend into the gene body -- confirm which window is intended.
  # trim() clips flanks that run past a contig boundary; flanks that end up
  # empty are dropped on write.
  gr_upstream <- flank(gr_gene, width = 2000, start = TRUE) %>%
    GenomicRanges::trim()
  df_upstream <- granges_to_bed_tbl(gr_upstream, name_col = "gene_id")

  # write file
  write_bed(
    df_upstream,
    file.path(genome_dir, "txdb_upstream.bed"),
    contigs_to_remove,
    drop_empty = TRUE
  )

  ##########
  # extract the 2000 bp immediately downstream of each gene (strand-aware).
  # NOTE(review): an earlier comment described this window as "-500 to +2000 of
  # gene end"; as with the upstream flank, the code only takes the 2000 bp
  # outside the gene -- confirm which window is intended.
  gr_downstream <- flank(gr_gene, width = 2000, start = FALSE) %>%
    GenomicRanges::trim()
  df_downstream <- granges_to_bed_tbl(gr_downstream, name_col = "gene_id")

  # write file
  write_bed(
    df_downstream,
    file.path(genome_dir, "txdb_downstream.bed"),
    contigs_to_remove,
    drop_empty = TRUE
  )

  ##########
  # INTRONS
  # I define a gene's introns to be:
  # any part of the gene that is not an exon of itself, or of other genes
  # this means that there might be common intron "bits" between genes, so some
  # intervals are repeated
  # however, since we will be calculating the PAV at the level of individual
  # genes, this is fine
  gr_intronicParts_raw <- GenomicRanges::setdiff(
    gr_gene, gr_exonicParts,
    ignore.strand = TRUE
  )
  hits <- findOverlaps(gr_intronicParts_raw, gr_gene, ignore.strand = TRUE)

  # one output interval per (intronic part, overlapping gene) pair
  gr_intronicParts <- gr_intronicParts_raw[queryHits(hits)]
  gr_intronicParts$gene_id <- gr_gene[subjectHits(hits)]$gene_id

  df_intronicParts <- granges_to_bed_tbl(gr_intronicParts, name_col = "gene_id")

  # write file
  write_bed(
    df_intronicParts,
    file.path(genome_dir, "txdb_intronic_parts.bed"),
    contigs_to_remove
  )
}
