# Work from the validation script directory under $HOME; the relative output
# path written at the end of this script resolves against it.
setwd(
  paste0(
    Sys.getenv("HOME"),
    "/code/malawi_transposon/script/pcr_mapping_validation/"
  )
)

library(readr)
library(dplyr)

# Read the preprocessed summary table with an explicit compact column spec:
# 20 leading columns followed by 8 repeats of a 6-column group ("ciicic").
# Empty strings, "NA" and "." are all parsed as missing values.
df_sv <- read_csv(
  file = paste0(
    Sys.getenv("HOME"),
    "/code/malawi_transposon/local/script/malawi_haplochromines-summary-preprocessed.csv"
  ),
  col_types = paste0("cciiiiicciiicilliidc", strrep("ciicic", 8)),
  na = c("", "NA", ".")
)

#' Convert an allele path string into a comma-separated, signed segment list.
#'
#' Segment ids are taken from every `s<digits>` token and orientations from
#' every `>` / `<` character; `>` is written as `+` and `<` as `-`, so
#' ">s1>s2<s3" becomes "1+,2+,3-".
#'
#' @param x A single path string. May be `NA`: the `read_csv()` call in this
#'   script parses "", "NA" and "." as missing.
#' @return A length-one character vector. "*" is returned when `x` is missing
#'   or contains no segment/orientation tokens.
extract_gfa_path <- function(x) {
  # A missing path must map to the "*" placeholder. Without this guard the NA
  # survives extraction and paste0(NA, NA) produces the literal string "NANA".
  if (length(x) == 0L || all(is.na(x))) {
    return("*")
  }
  x <- as.character(x)

  # Segment ids: every "s<digits>" token with the "s" prefix removed.
  segment_list <- gsub(
    "s", "",
    unlist(regmatches(x, gregexpr("s[0-9]+", x))),
    fixed = TRUE
  )

  # Orientations: every ">" or "<", translated to "+" / "-" respectively.
  direction_list <- chartr(
    "><", "+-",
    unlist(regmatches(x, gregexpr("[><]", x)))
  )

  # Segments and orientations are extracted independently and paired by
  # position. NOTE(review): this assumes exactly one orientation marker per
  # segment in the input -- confirm against the upstream path format.
  gfa_path <- paste(paste0(segment_list, direction_list), collapse = ",")
  if (!nzchar(gfa_path)) {
    gfa_path <- "*"
  }

  gfa_path
}

# Keep the bubble identifier (renamed from `id`) plus every `*_path` column,
# and rewrite each path value with extract_gfa_path(), applied element-wise.
df_out <- mutate(
  select(df_sv, bubble_id = id, ends_with("_path")),
  across(ends_with("_path"), Vectorize(extract_gfa_path))
)

# Write the converted table as tab-delimited text, relative to the working
# directory.
write_delim(df_out, "./local/alleles_by_bubbles.txt", delim = "\t")
