library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(glue)
library(scales)

# Use the classic theme as the session-wide default for the plots below
# (individual plots may still override it).
theme_set(theme_classic())

# Species colour scheme. The columns used further down are `species_abrv`
# and `color`. '#' is the comment character of this CSV, so colours are
# presumably stored as bare hex codes; the '#' prefix is added back here.
# `color` is read as character on purpose: if every code in the file
# consisted of digits only (e.g. "004488"), type guessing would parse the
# column as numeric and drop leading zeros, yielding invalid colours.
color_schema_species <- readr::read_csv(
  file.path(
    Sys.getenv("HOME"),
    "code/malawi_transposon/metadata/species_colorscheme.csv"
  ),
  comment = "#",
  col_types = readr::cols(color = readr::col_character())
) %>%
  mutate(color = paste0("#", color))

# Segment-level table: the first input declared by the Snakemake rule.
df_raw <- read_csv(snakemake@input[[1]])
                   
################################################################################

# Species abbreviations in the order given by the colour-scheme table.
species_order <- color_schema_species$species_abrv

# Move the levels listed in `species_order` to the front of the factor,
# in that order.
order_species <- function(x) {
  forcats::fct_relevel(x, species_order)
}

# Roll the segment table up to one row per (species1, species2, bubble_id).
# THIS CAN TAKE A WHILE TO RUN.
df_bubble <- df_raw %>%
  # keep only the segments whose core flag is FALSE
  filter(core_bool == FALSE) %>%
  # `bubble_id` may hold several comma-separated ids: one row per id
  separate_rows(bubble_id, sep = ",") %>%
  # drop rows that are exact duplicates
  distinct() %>%
  group_by(species1, species2, bubble_id) %>%
  summarise(
    count = n(),          # rows (segments) in the bubble
    maxlen = max(length), # longest segment in the bubble
    sumlen = sum(length)  # total segment length of the bubble
  ) %>%
  ungroup() %>%
  mutate(
    species1 = order_species(species1),
    species2 = order_species(species2)
  )

################################################################################

# BUBBLE LEVEL STATISTICS

# Per species pair: number of bubbles, mean bubble size and total bubble
# size (size = `sumlen`), written out as a tab-separated table.
bubble_stats <- summarise(
  group_by(df_bubble, species1, species2),
  n_bubble = n(),
  mean_bubble_size = mean(sumlen),
  total_bubble_size = sum(sumlen)
)
write_delim(bubble_stats, snakemake@output$stats, delim = "\t")

# Grid of histograms of bubble size (`sumlen`) on a log10 x axis:
# one column per `species1`, one row per `species2` (row order reversed).
p <- df_bubble %>%
  ggplot(aes(sumlen)) +
  geom_histogram(bins = 20) +
  facet_grid(forcats::fct_rev(species2) ~ species1) +
  # zoom only: the upper x limit is 1e5, the lower limit is left to ggplot
  coord_cartesian(xlim = c(NA, 10^5)) +
  scale_x_log10(
    "Total length of flexible segments by bubble",
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x))
  ) +
  ylab("Number of bubbles") +
  # this figure uses the gray theme instead of the session default
  theme_gray() +
  theme(aspect.ratio = 1)
# Pass the plot explicitly instead of relying on ggsave()'s implicit
# `last_plot()` default, so the saved figure cannot depend on hidden state.
ggsave(snakemake@output$hist, plot = p, width = 7.8, height = 7.8)

################################################################################

# AGGREGATED STATISTICS

# Step 1: within each species pair, rank bubbles by log size and assign
# them to (at most) 100 rank-based bins of near-equal count.
bubbles_binned <- df_bubble %>%
  group_by(species1, species2) %>%
  mutate(bubbleBin = ntile(log(sumlen), 100))

# Step 2: summarise every bin, then accumulate over bins.
# summarise() drops the last grouping level (bubbleBin), so the result is
# still grouped by (species1, species2) and the cumulative sums below run
# across the bins of each species pair, in increasing bin order.
df_bubble_agg <- bubbles_binned %>%
  group_by(species1, species2, bubbleBin) %>%
  summarise(
    count = n(),
    totalBubbleSize = sum(sumlen),
    maxBubbleSize = max(sumlen)
  ) %>%
  mutate(
    cumCount = cumsum(count),
    cumBubbleSize = cumsum(totalBubbleSize)
  )

# Cumulative number of bubbles against bubble size (log10 of the largest
# bubble in each bin): one panel per `species1`, one line per `species2`.
p <- df_bubble_agg %>%
  ggplot(aes(x = log10(maxBubbleSize), y = cumCount, color = species2)) +
  geom_line() +
  facet_wrap(. ~ species1, nrow = 2) +
  xlab("Size of bubble (log10)") +
  ylab("Cumulative number of detected bubbles") +
  # colours are paired with species through `breaks`
  scale_color_manual(
    "species aligned to backbone",
    values = color_schema_species$color,
    breaks = color_schema_species$species_abrv,
    labels = color_schema_species$species_abrv
  ) +
  theme(
    legend.position = "top",
    legend.title = element_text(size = rel(0.75)),
    legend.text = element_text(size = rel(0.65)),
    legend.key.size = unit(0.04, "npc")
  )
# Pass the plot explicitly instead of relying on ggsave()'s implicit
# `last_plot()` default.
ggsave(snakemake@output$cum_count, plot = p, width = 6.4, height = 4)

# Cumulative total length (bases) contained in bubbles against bubble size.
# This plot is of limited use: y grows quadratically with respect to x,
# which makes it hard to interpret.
# The plot is assigned to `p` (like the other figures) so that it is not
# auto-printed when the script runs non-interactively, which would open the
# default graphics device and leave a stray Rplots.pdf behind.
p <- df_bubble_agg %>%
  ggplot(aes(x = log10(maxBubbleSize), y = cumBubbleSize, color = species2)) +
  geom_line() +
  facet_wrap(. ~ species1, nrow = 2) +
  xlab("Size of bubble (log10)") +
  ylab("Cumulative length of detected bubbles") +
  # colours are paired with species through `breaks`
  scale_color_manual(
    "species aligned to backbone",
    values = color_schema_species$color,
    breaks = color_schema_species$species_abrv,
    labels = color_schema_species$species_abrv
  ) +
  theme(
    legend.position = "top",
    legend.title = element_text(size = rel(0.75)),
    legend.text = element_text(size = rel(0.65)),
    legend.key.size = unit(0.04, "npc")
  )
# Save this exact plot rather than whatever `last_plot()` happens to be.
ggsave(snakemake@output$cum_len, plot = p, width = 6.4, height = 4)
