library(ggplot2)
library(tibble)
library(dplyr)
library(tidyr)
library(readr)
library(glue)

# Work from the project's R script directory under $HOME; the relative data
# path passed to read_csv() below is resolved against this directory.
setwd(
  paste0(Sys.getenv("HOME"), "/code/malawi_transposon/script/R_SCRIPT/")
)

# Load the preprocessed summary table.
# col_types is a compact readr spec of 68 one-letter codes: 20 leading columns
# followed by the same 6-column group repeated 8 times.
# Empty strings, "NA" and "." are all read as missing values.
df_sv <- read_csv(
  file = "./local/malawi_haplochromines-summary-preprocessed.csv",
  col_types = paste0("cciiiiicciiicilliidc", strrep("ciicic", 8)),
  na = c("", "NA", ".")
)


# Subset of rows with more than 1000 paths and n_uncalled == 0, keeping only
# the id, the segment/path counts, the length range and every "*_len" column.
tmp_df_complex <- select(
  filter(df_sv, n_paths > 1000 & n_uncalled == 0),
  id, n_segments, n_paths, n_paths_bio, len_min, len_max,
  ends_with("_len")
)

# Interactive inspection: among those rows, the ones whose len_min is 0.
View(tmp_df_complex %>% filter(len_min == 0))


# Hand-picked examples: print the segment list stored for each id.

# 1027 paths (about 2^10): a simpler bubble, more traceable for illustration
df_sv[["segment_list"]][df_sv$id == "14:27902095-27923959"]

# Not many paths either, but mostly small variation; one part is very hard
# to trace
df_sv[["segment_list"]][df_sv$id == "13:14108819-14112783"]

# Quite comprehensible: one simple allele plus a nested one
df_sv[["segment_list"]][df_sv$id == "7:38163562-38266473"]

# Another nice complex case, also with a 0 allele, but less comprehensible
df_sv[["segment_list"]][df_sv$id == "22:27388575-27469305"]

# Seems real, with 2**31 theoretical paths: two fairly complex main alleles
df_sv[["segment_list"]][df_sv$id == "3:24780363-24881560"]


##########################################################################################

# CHECKING OUT SOME GENES

# fhl2b gene: rows on chr 23 with start > 26838000 and end < 26847100,
# showing the id, n_paths_bio and every "*_len" column.
select(
  filter(df_sv, chr == "23" & start > 26838000 & end < 26847100),
  id, n_paths_bio, ends_with("_len")
)

# rx1 gene: rows on chr 23 with start > 1403000 and end < 1414100,
# showing the id, n_paths_bio and every "*_len" column.
df_sv %>%
  filter(chr == "23", start > 1403000, end < 1414100) %>%
  select(id, n_paths_bio, ends_with("_len"))

# All segments of those same rows, flattened into one comma-separated string.
# strsplit() is vectorised over its input, so it is applied to the whole
# column directly; unlist() then flattens the per-row pieces before joining.
df_sv %>%
  filter(chr == "23", start > 1403000, end < 1414100) %>%
  pull(segment_list) %>%
  strsplit(",") %>%
  unlist() %>%
  paste(collapse = ",")


##########################################################################################

# mitfa, first intron: rows on chr 5 with start > 30887700 and end < 30898000,
# showing the id, n_paths_bio and every "*_len" column.
select(
  filter(df_sv, chr == "5" & start > 30887700 & end < 30898000),
  id, n_paths_bio, ends_with("_len")
)

# mitfa, second intron and beyond: same columns for rows on chr 5 with
# start > 30898900 and end < 30912500.
select(
  filter(df_sv, chr == "5" & start > 30898900 & end < 30912500),
  id, n_paths_bio, ends_with("_len")
)

# Rows for the whole mitfa region (for broader visualisation): chr 5 with
# start > 30886000 and end < 30918000.
df_mitfa <- df_sv %>%
  filter(chr == "5", start > 30886000, end < 30918000)

# All segments of those rows, flattened into one comma-separated string.
# strsplit() is vectorised over its input, so it is applied to the whole
# column directly; unlist() then flattens the per-row pieces before joining.
df_mitfa$segment_list %>%
  strsplit(",") %>%
  unlist() %>%
  paste(collapse = ",")

