# This script takes a BED file of "missing regions" as input and plots them as bands.
# In the output plot, each band represents a missing region: the x position is the
# genomic position from the BED file and the y position is the scaffold number (the
# numeric part of the scaffold ID), assuming that scaffold names follow the pattern
# [^_]+_[0-9]+
# cmdoret, 20220501

library(tidyverse)
# Load the C3-specific regions. Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
bed_path <- "C3_specific_regions.bed"
bed_cols <- c("scf", "start", "end", "scf_neff", "qual", "strand")
df <- read_tsv(bed_path, col_names = bed_cols)

# Total C3-specific DNA per scaffold: summed (end - start), largest first.
# Printed explicitly so the table is also shown when the script is run through
# source(), which does not auto-print top-level values by default.
df %>%
  group_by(scf) %>%
  summarize(tot = sum(end - start)) %>%
  arrange(desc(tot)) %>%
  print()

# Scaffold number = second "_"-separated piece of the scaffold name.
# str_split_fixed() always returns the requested number of columns, so taking
# column 2 does not fail when no name contains an underscore; such names get an
# empty piece and end up with scf_no = NA.
# NOTE(review): the plot title says "50 largest scaffolds", which presumably
# relies on scaffolds being numbered by decreasing size; `scf_no < 50` keeps
# numbers up to 49 only -- confirm whether numbering starts at 0 or 1.
df <- df %>%
  mutate(scf_no = as.numeric(str_split_fixed(scf, "_", 3)[, 2])) %>%
  filter(scf_no < 50)

# One rectangle per region: x spans start..end, y is a thin band (+/- 0.1)
# centred on the scaffold number.
c3_regions_plot <- ggplot(
  data = df %>% arrange(scf_no),
  aes(xmin = start, xmax = end, ymin = scf_no - 0.1, ymax = scf_no + 0.1)
) +
  geom_rect() +
  theme_minimal() +
  ggtitle("A. castellanii str. C3 regions with no mapping in Neff\n(only 50 largest scaffolds shown)") +
  xlab("Genomic position in C3") +
  ylab("Scaffold number")

# Explicit print so the figure is drawn under source() as well as Rscript.
print(c3_regions_plot)
