#library(AnnotationDbi)
#library(GenomicRanges)
library(scales)
library(factoextra)
library(ggplot2)
library(tibble)
library(dplyr)
library(tidyr)
library(readr)
library(glue)
#library(reactable)

# Draw every ggplot in this session with the classic theme.
theme_set(theme_classic())

# Switch to the R script directory under $HOME; the relative source() call
# below is resolved against it.
setwd(
  paste0(
    Sys.getenv("HOME"),
    "/code/malawi_transposon/script/R_SCRIPT/"
  )
)

# Shared project helpers. The remaining modules are currently disabled.
source(file = "modules/frequently_used_items.R")
#source("modules/get_chrom_sizes.R")
#source("modules/generic_granges_functions.R")
#source("modules/pseudogenome_functions.R")

##########
# Dataset and genome-build labels. Nothing in the visible part of this script
# reads them -- presumably they are consumed elsewhere; confirm before removing.
subdir <- "malawi_haplochromines"
bb_name <- "astCal1.2_ensembl"
bb_name_sqlite <- "astCal_v1.2"
bb_name_abrv <- "astCal"

# Input locations are derived from $HOME, like the setwd() call earlier in the
# script, instead of hard-coding one user's home directory. When HOME is
# /Users/pittachalk the resulting paths are identical to the former literals.
home_dir <- Sys.getenv("HOME")
guide_dir <- file.path(
  home_dir, "code", "malawi_transposon", "script",
  "te_permutation_shuffle", "guide"
)

# NOTE(review): the second dash in this file name is an en dash (U+2013), kept
# exactly as it was -- confirm it matches the file name on disk.
path_to_intersect <- file.path(guide_dir, "astCal-intersect–te_as_whole.bed")
path_to_te <- file.path(guide_dir, "transposon.simple")
path_to_consensus_sizes <- file.path(
  home_dir, "storage", "malawi_transposon", "cloud", "repeatmodeler",
  "calliptera_repbase_jun21", "astCal1.2_ensembl-families.fa.fai"
)

# Load the headerless, tab-separated intersect table. The col_types spec
# covers 13 fields and skips two of them ("_"); the 11 names below label the
# fields that are kept (te_* columns and the sv_* columns they overlap).
# The composite te_id field ("class:subclass:id:repmask_id") is then split
# into four columns, non-TE repeat classes are dropped, and two class labels
# are normalised ("RC" -> "Helitron", "SINE?" -> "SINE").
df_intersect <- readr::read_delim(
  path_to_intersect,
  delim = "\t",
  col_types = "ciicd_ciici_i",
  col_names = strsplit(
    "te_chr,te_start,te_end,te_id,te_age,sv_chr,sv_start,sv_end,sv_id,sv_len,overlap_len",
    split = ",",
    fixed = TRUE
  )[[1]]
) %>%
  separate_wider_delim(
    te_id,
    delim = ":",
    names = c("te_class", "te_subclass", "te_id", "repmask_id")
  ) %>%
  filter(
    !(te_class %in% c(
      "Simple_repeat", "Low_complexity", "tRNA", "snRNA", "rRNA", "Satellite"
    ))
  ) %>%
  mutate(
    te_class = case_when(
      te_class == "RC" ~ "Helitron",
      te_class == "SINE?" ~ "SINE",
      TRUE ~ te_class
    )
  )
  
# Load the tab-separated TE annotation table (12 typed columns; column names
# come from the file's header row). Spaces are stripped from
# repeat_class_broad, non-TE repeat classes are dropped, two class labels are
# normalised ("RC" -> "Helitron", "SINE?" -> "SINE"), and rows are sorted by
# genomic position.
df_te <- readr::read_delim(
  path_to_te,
  delim = "\t",
  col_types = "cdciiccccccc"
) %>%
  mutate(repeat_class_broad = gsub(" ", "", repeat_class_broad)) %>%
  filter(
    !(repeat_class_broad %in% c(
      "Simple_repeat", "Low_complexity", "tRNA", "snRNA", "rRNA", "Satellite"
    ))
  ) %>%
  mutate(
    repeat_class_broad = case_when(
      repeat_class_broad == "RC" ~ "Helitron",
      repeat_class_broad == "SINE?" ~ "SINE",
      TRUE ~ repeat_class_broad
    )
  ) %>%
  arrange(chr, start, end)

# Read the first two fields (sequence name, sequence length) of the
# five-column, headerless .fai file and skip the remaining three.
# `col_types` is spelled out in full: the previous `col_type` only worked
# through R's partial argument matching.
df_consensus_sizes <- readr::read_delim(
  path_to_consensus_sizes,
  delim = "\t",
  col_types = "ci___",
  col_names = c("repmask_id", "consensus_length")
) %>%
  # Drop the "#" and everything after it, leaving the bare family identifier
  # that is used as the join key against the TE tables.
  mutate(repmask_id = gsub("#.+", "", repmask_id))



# Exploratory check: which annotated TEs have a consensus length available.
# The result is only printed, not stored. The join key is stated explicitly
# (repmask_id is the only column the two tables share) so the join does not
# depend on dplyr's implicit natural-join column detection.
inner_join(
  select(df_te, te_id = id, repmask_id),
  df_consensus_sizes,
  by = "repmask_id"
)

# Attach each TE family's consensus length to every row of the intersect
# table and derive per-row size metrics:
#   te_len           span of the annotated TE copy (te_end - te_start)
#   overlap_perc_te  share of the TE copy covered by the overlap; this is a
#                    fraction (overlap_len / te_len), not a percentage
#   intactness       TE copy length relative to its family consensus length
# Rows whose repmask_id has no match in df_consensus_sizes get NA for
# consensus_length and therefore for intactness.
df_merged <- df_intersect %>%
  # repmask_id is the only column shared by the two tables; naming it keeps
  # the join from silently changing if either table gains a column.
  left_join(df_consensus_sizes, by = "repmask_id") %>%
  mutate(
    te_len = te_end - te_start,
    overlap_perc_te = overlap_len / te_len,
    intactness = te_len / consensus_length
  )


# Histogram of intactness for TE copies no longer than their consensus
# (intactness <= 1; rows with NA intactness are dropped by the filter),
# coloured by whether more than 90% of the TE copy lies inside the overlap.
# This figure used to be drawn twice by two identical consecutive statements;
# it is now drawn once.
# NOTE(review): if the second copy was meant to show something else (e.g. a
# different filter or threshold), add that variant here.
df_merged %>%
  filter(intactness <= 1) %>%
  ggplot(aes(x = intactness, fill = overlap_perc_te > 0.9)) +
  geom_histogram()

# Histogram of TE copy length, coloured by whether more than 90% of the TE
# copy lies inside the overlap. position = "identity" draws the two groups on
# top of each other rather than stacking them.
df_merged %>%
  ggplot(aes(x = te_len, fill = overlap_perc_te > 0.9)) +
  geom_histogram(position = "identity")

# Same layout for te_age.
df_merged %>%
  ggplot(aes(x = te_age, fill = overlap_perc_te > 0.9)) +
  geom_histogram(position = "identity")

# Total te_len per te_age bin (bin edges every 0.5 from 0 to 49), stacked by
# the same >90% overlap flag.
df_merged %>%
  ggplot(aes(x = te_age, y = te_len, fill = overlap_perc_te > 0.9)) +
  stat_summary_bin(
    geom = "bar",
    fun = "sum",
    position = "stack",
    breaks = seq(from = 0, to = 49, by = 0.5)
  )


# Legacy plot, meant to be removed.
# Stacked bar chart of summed `width / flex_size * 100` per 0.5-wide bin of
# `te_score + 0.01`, coloured by TE class, with a secondary x axis that
# rescales the primary axis via `generation_time` and `mu` and labels
# selected time points.
# NOTE(review): gr_intersect_list, df_pseudo_sizes, color_schema_te,
# generation_time and mu are not defined in the visible part of this script --
# presumably they come from modules/frequently_used_items.R or from code that
# has since been removed; confirm before running.
gr_intersect_list$astCal %>%
  as.data.frame() %>%
  filter(repmask_id != "rnd-1_family-37") %>%
  left_join(df_pseudo_sizes, by="species") %>%
  ggplot(aes(te_score+0.01, width / flex_size * 100, fill=te_class)) +
  stat_summary_bin(geom='bar', fun='sum', position='stack', breaks = seq(0, 49, 0.5)) +
  scale_fill_manual('Transposon Class', values = color_schema_te$color, breaks = color_schema_te$te_family) +
  # Secondary-axis breaks are given on the transformed (mya) scale. Label
  # spelling fixed: "Lamprologini", "break up".
  scale_x_continuous(sec.axis = sec_axis(
    ~ . / 100 * generation_time / mu / 1e6,
    breaks = c(0.9, 9.2, 20, 62.1, 87.3, 150),
    labels = c(
      "Onset of Lake Malawi radiation (0.9 mya)",
      "Haplochromine-Lamprologini lineage split (9.2 mya)",
      "Split from Oreochromis (20.0 mya)",
      "East Africa cichlids split from Cichlidae (62.1 mya)",
      "Cichlidae split from Percomorphaceae (87.3 mya)",
      "Gondwana supercontinent begins to break up (150 mya)") )) +
  # Dotted guides on the primary axis, one per secondary-axis break above;
  # the positions are hard-coded rather than computed from generation_time/mu.
  geom_vline(xintercept = c(0.11, 1.07, 2.33, 7.245, 10.185, 17.5), linetype="dotted") +
  xlab("Divergence from ancestral consensus (%)") +
  ylab("Percentage sequence of flexible region") +
  coord_cartesian(ylim=c(0, NA), xlim = c(0, 40)) +
  theme(axis.text.x.top = element_text(angle = 45, hjust = 0),
        legend.position = c(0.9, 0.6))
