Correlation between different sequencing methods
Using insertions per gene
# Read one insertions-per-gene table produced by a sequencing run.
#   x   - file name inside <data_dir>/hpc_output/<ver>/
#   ver - name of the run sub-directory; also stored in the `run` column
# Returns (invisibly) the table with three extra columns: `sample` (the file
# name minus its last 21 characters), `gene` (all NA) and `run`.
get_ipg <- function(x, ver) {
  ipg_path <- paste0(data_dir, "/hpc_output/", ver, "/", x)
  ipg_spec <- cols(CDS = col_character(), `#insertion` = col_double())
  per_gene <- read_tsv(ipg_path, col_types = ipg_spec)
  per_gene <- mutate(per_gene, sample = str_sub(x, 1, -22), gene = NA, run = ver)
  invisible(per_gene)
}
# Collect the insertions-per-gene tables of every sequencing run into one long
# table; `sample` and `run` are merged into a single `sample_rep` key.
# list.files() interprets `pattern` as a regular expression (not a glob), so
# the file-name suffix is matched with an escaped, end-anchored pattern.
# Each run is read independently and the pieces are bound once at the end, so
# a run directory without matching files simply contributes no rows.
ipg_forcor <- map(seq_runs, function(run_id) {
  run_dir <- paste0(data_dir, "/hpc_output/", run_id, "/")
  ipg_files <- list.files(path = run_dir, pattern = "insertionPerGene\\.txt$")
  bind_rows(map(ipg_files, get_ipg, ver = run_id))
}) %>%
  bind_rows() %>%
  unite(sample_rep, sample, run)
# get correlations
# `sample_rep` is "<sample>_<run>" with a 4-character run name, so the last
# four characters are the run and everything before the separating "_" is the
# sample ID. The ID is joined against `strain_names` and the table is spread
# to one column of insertion counts per "<strain_names>_<run>" library.
ipg_forcor <- ipg_forcor %>%
  mutate(
    strain = str_sub(sample_rep, 1, -6),
    rep = str_sub(sample_rep, -4, -1)
  ) %>%
  left_join(strain_names) %>% # NOTE(review): natural join, presumably on `strain` - confirm
  unite(sample_rep, strain_names, rep) %>%
  dplyr::select(CDS, sample_rep, `#insertion`) %>%
  pivot_wider(names_from = sample_rep, values_from = `#insertion`)

# Draw the pairwise scatter / Pearson-correlation matrix for one set of
# library columns and save it as <fig_dir>/SFigX<panel_letter>.pdf.
save_cor_panel <- function(reps, panel_letter) {
  cor_plots <- GGally::ggpairs(
    reps,
    upper = list(continuous = GGally::wrap("cor", method = "pearson"))
  ) +
    ggtitle(panel_letter)
  # Arguments are spelled out in full: the previous `file =` only worked
  # through partial matching of ggsave()'s `filename` argument.
  ggsave(
    filename = paste0(fig_dir, "/SFigX", panel_letter, ".pdf"),
    plot = cor_plots,
    height = 10,
    width = 10
  )
}

# Panel A: the two eu libraries across the nyc runs
save_cor_panel(
  dplyr::select(ipg_forcor, eu_1_nyc1, eu_1_nyc2, eu_2_nyc1, eu_2_nyc2),
  LETTERS[1]
)

# Panels B onwards: one panel per remaining strain, using every column whose
# name contains the strain label
cnv_strains <- c("aneu", "trip1", "trip2", "trip3", "trip4", "iso", "quad")
for (k in seq_along(cnv_strains)) {
  rep_cols <- colnames(ipg_forcor)[str_detect(colnames(ipg_forcor), cnv_strains[k])]
  save_cor_panel(dplyr::select(ipg_forcor, rep_cols), LETTERS[k + 1])
}
# Read one header-less reads-per-position table.
#   x   - file name inside <data_dir>/hpc_output/<ver>/
#   ver - sub-directory name; also stored in the `version` column
# Returns (invisibly) the table with its first three columns named
# `chromosome`, `chr_pos` and `reads`, plus `sample` (the file name minus its
# last 15 characters) and `version`.
get_rpp <- function(x, ver) {
  rpp_path <- paste0(data_dir, "/hpc_output/", ver, "/", x)
  raw_rpp <- read_tsv(rpp_path, col_names = FALSE, col_types = cols())
  named_rpp <- dplyr::rename(raw_rpp, chromosome = X1, chr_pos = X2, reads = X3)
  invisible(mutate(named_rpp, sample = str_sub(x, 1, -16), version = ver))
}
# Load every reads-per-position file of the combined run into one table.
# list.files() interprets `pattern` as a regular expression (not a glob), so
# the suffix is escaped and anchored at the end of the file name.
files <- list.files(
  path = paste0(data_dir, "/hpc_output/combined/"),
  pattern = "readPerPos\\.txt$"
)
read_per_pos <- map(files, get_rpp, ver = "combined")
# bind_rows() binds the whole list in one call and returns an empty tibble
# (rather than NULL) when no file was found.
rpp_df <- bind_rows(read_per_pos)
# Supplementary Table 1: per-library summary of the reads-per-position table.
# For every (strain_names, Gresham_ID) pair: number of rows (`total_sites`)
# and the min / max / mean / median of `reads`.
summary_rpp <- rpp_df %>%
  left_join(strain_names, by = c("sample" = "strain")) %>%
  rename(Gresham_ID = sample) %>%
  group_by(strain_names, Gresham_ID) %>%
  summarize(
    total_sites = n(),
    min_rpp = min(reads),
    max_rpp = max(reads),
    mean_rpp = mean(reads),
    median_rpp = median(reads)
  )

# Show the table in the notebook, then write it next to the figures
kableExtra::kable_styling(knitr::kable(summary_rpp))
write_csv(summary_rpp, paste0(fig_dir, "/SupplementaryTable1.csv"))
# number of insertion sites scales with total reads generated
reads_per_lib <- read_csv(paste0(data_dir, "/total_reads_per_library.csv"))

# One point per library: rows counted in `summary_rpp` (x) against the `reads`
# column of the per-library table (y), coloured by strain.
sfig2 <- reads_per_lib %>%
  left_join(summary_rpp, by = c("Sample" = "Gresham_ID")) %>%
  ggplot(aes(x = total_sites, y = reads, color = strain_names)) +
  geom_point(size = 2) +
  scale_color_manual(values = strain_cols) +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "Unique insertion sites", y = "Total reads ") +
  theme(legend.title = element_blank())

# Same figure as PDF and PNG
for (sfig2_file in c("/SFig2.pdf", "/SFig2.png")) {
  ggsave(paste0(fig_dir, sfig2_file), plot = sfig2, width = 8.5, height = 5, units = "in")
}
#Supplemental_Fig_S9
# essential genes have fewer insertions
# Look up the coordinates of one CDS in an annotation table.
#   x - ID prefix of the attribute column, e.g. "ID=cds0"
#   y - annotation table; column X1 is used as chromosome, X4 / X5 as
#       coordinates, X7 as strand and X9 as the attribute string
# Returns a one-row tibble (chromosome, start, stop, promoter, strand). When
# several rows match, the smallest X4 and largest X5 are used. For '+' genes
# start = X4, stop = X5 and promoter = X4 - 200; otherwise start = X5,
# stop = X4 and promoter = X5 + 200.
# Stops with an explicit message when no row matches `x`.
get_cds_promoter <- function(x, y) {
  # fixed(): the ID is matched literally, so characters such as "." in an ID
  # are not treated as regular-expression wildcards
  hits <- dplyr::filter(y, str_starts(X9, fixed(paste0(x, ";"))))
  if (nrow(hits) == 0) {
    stop("No annotation row has an attribute field starting with '", x, ";'",
      call. = FALSE
    )
  }

  chromosome <- hits$X1[1]
  strand <- hits$X7[1]
  lowest <- min(hits$X4)
  highest <- max(hits$X5)

  if (strand == "+") {
    start <- lowest
    cds_stop <- highest
    promoter <- lowest - 200
  } else {
    start <- highest
    cds_stop <- lowest
    promoter <- highest + 200
  }
  tibble(chromosome, start, stop = cds_stop, promoter, strand)
}
# x is a gene that I want to get info for (in "ID=cds0" form), inserts is the
# reads per position table (its sample, chromosome and chr_pos columns are used).
# The gene coordinates come from get_cds_promoter() on the global `gff_cds`.
# Returns, for each sample in inserts, one "cds" row and one "promoter" row
# with columns type, id, sample and "1".."100": the number of rows of
# `inserts` falling in each of 100 bins. CDS bins split start..stop into 100
# equal parts; promoter bins are 2 bp wide and begin at `promoter` (200 bp
# before start). Bins are numbered in the direction of transcription and are
# half-open (near edge included, far edge excluded), so a position is never
# counted in two bins. All columns are character because each row is built
# from a named character vector.
get_binned_unique_insertions <- function(x, inserts) {
  n_bins <- 100
  y <- get_cds_promoter(x, y = gff_cds)
  # The bin width must be positive on both strands. get_cds_promoter() returns
  # start < stop for '+' genes, so the signed difference start - stop gave
  # them a negative width and every '+' strand CDS bin stayed empty.
  bin_size_cds <- abs(y$start - y$stop) / n_bins
  bin_size_prom <- 2
  # +1: bins run towards higher coordinates ('+'); -1: towards lower ones
  direction <- if (y$strand == "+") 1 else -1

  # Count the positions in each bin. Bin j spans the distance
  # [width * (j - 1), width * j) from `origin`, measured along `direction`;
  # `in_region` masks positions outside the feature.
  count_bins <- function(pos, origin, width, in_region) {
    vapply(seq_len(n_bins), function(j) {
      near_edge <- origin + direction * width * (j - 1)
      far_edge <- origin + direction * width * j
      in_bin <- if (direction > 0) {
        pos >= near_edge & pos < far_edge
      } else {
        pos <= near_edge & pos > far_edge
      }
      sum(in_bin & in_region, na.rm = TRUE)
    }, integer(1))
  }

  cds <- NULL
  promo <- NULL
  for (i in unique(inserts$sample)) {
    ini <- inserts %>% dplyr::filter(sample == i, chromosome == y$chromosome)
    pos <- ini$chr_pos
    if (direction > 0) {
      in_cds <- pos <= y$stop
      in_prom <- pos < y$start
    } else {
      in_cds <- pos >= y$stop
      in_prom <- pos > y$start
    }
    bin_fill_cds <- count_bins(pos, y$start, bin_size_cds, in_cds)
    bin_fill_prom <- count_bins(pos, y$promoter, bin_size_prom, in_prom)
    names(bin_fill_cds) <- seq_len(n_bins)
    names(bin_fill_prom) <- seq_len(n_bins)
    cds <- cds %>% bind_rows(c(type = "cds", id = x, sample = i, bin_fill_cds))
    promo <- promo %>% bind_rows(c(type = "promoter", id = x, sample = i, bin_fill_prom))
  }
  bind_rows(cds, promo)
}
# Bin the insertions of every gene in every library and stack the per-gene
# tables (slow: one get_binned_unique_insertions() call per gene).
# NOTE(review): this expects `all_genes` to have an `X1` column of IDs in
# "ID=cds0" form - confirm where that table is created.
binned_inserts <- all_genes$X1 %>%
  map(get_binned_unique_insertions, inserts = rpp_df) %>%
  do.call(what = rbind)

# Optional cache of the result:
#write_csv(binned_inserts, "./binned_inserts_temp.csv")
#read_csv("./binned_inserts_temp.csv")
# Translate IDs found in labtools::yeast_gene_names$GCF_000146045.2_R64_genomic_ID
# to the matching "Systematic_name" entry; IDs without a translation are
# returned unchanged.
#   name_vec - vector of IDs to translate
yeast_r64_to_systematic <- function(name_vec) {
  lookup <- labtools::yeast_gene_names
  row_idx <- match(name_vec, lookup$GCF_000146045.2_R64_genomic_ID, nomatch = NA)
  systematic <- lookup[row_idx, "Systematic_name"]
  # Fall back to the input wherever the lookup produced NA
  untranslated <- is.na(systematic)
  systematic[untranslated] <- name_vec[untranslated]
  systematic
}
# Add the systematic gene name for every CDS ID. This used to assign into `t`,
# which is not created anywhere in the visible code (it resolves to base::t),
# while the following steps read the `gene` column of `binned_inserts`.
binned_inserts$gene <- yeast_r64_to_systematic(binned_inserts$id)
# get list of essential genes from Winzeler 1999
# Lines starting with "=" are skipped; only the ORF name is kept (as `gene`)
# together with a constant flag column.
ess_del <- read_tsv(
  paste0(data_dir, "/Essential_ORFs.txt"),
  col_names = TRUE,
  comment = "=",
  col_types = cols(
    rec_num = col_double(),
    ORF_name = col_character(),
    deletion_alias = col_character(),
    gene_names = col_character(),
    UPTAG_sequence_20mer = col_character(),
    DNTAG_sequence_20mer = col_character()
  )
) %>%
  dplyr::select(gene = ORF_name) %>%
  mutate(ess_del = "yes")

# get gene fitness in ypgal from costanzo et al 2021
# dplyr::select is namespaced like elsewhere in this file so that a `select`
# from another attached package cannot be picked up instead.
fit_gal <- read_csv(
  paste0(data_dir, "/Costanzo_Mutant Fitness_Conditions-Table 1.csv"),
  col_names = TRUE
) %>%
  dplyr::select(`Systematic Name`, `Gene Name`, `Allele (Essential genes only)`, `Galactose`)

# Quartile boundaries of the Galactose values (0/25/50/75/100 %), computed
# once; rows with a missing Galactose value get an NA quartile.
gal_breaks <- quantile(fit_gal$Galactose, na.rm = TRUE)
fit_gal <- fit_gal %>%
  mutate(quartile = case_when(
    Galactose <= gal_breaks[[2]] ~ "Q1",
    Galactose <= gal_breaks[[3]] ~ "Q2",
    Galactose <= gal_breaks[[4]] ~ "Q3",
    Galactose <= gal_breaks[[5]] ~ "Q4"
  ))
# Flag essential genes, attach the Galactose fitness quartile and reshape the
# 100 bin columns to long form (one row per gene x sample x type x bin).
# `quartile` must be listed among the identifier columns: left out, it was
# pivoted as if it were a bin, which turned it into NA values and removed the
# column that the quartile plot groups by.
# NOTE(review): if `fit_gal` holds several rows per `Systematic Name`, this
# join repeats the gene's rows - confirm the table is unique per gene.
binned_inserts <- binned_inserts %>%
  mutate(essential = dplyr::if_else(gene %in% ess_del$gene, "yes", "no")) %>%
  left_join(
    fit_gal %>% dplyr::select(`Systematic Name`, quartile),
    by = c("gene" = "Systematic Name")
  ) %>%
  pivot_longer(
    cols = -c(gene, essential, quartile, type, id, sample),
    names_to = "bin",
    values_to = "inserts_per_bin"
  ) %>%
  # Bin labels and counts arrive as character strings
  mutate(across(c(bin, inserts_per_bin), as.numeric))
# Per sample / feature type / essentiality / bin: total, mean and median
# insertions per bin, plus the total divided by a gene count (number of rows
# of `ess_del` for essential genes, remaining genes otherwise).
# The statistics are computed per group with mutate() and then collapsed to
# one row per group with distinct(); the result stays grouped.
# NOTE(review): nrow(all_genes) assumes `all_genes` is a data frame - confirm.
meta_bin_insert_essential <- binned_inserts %>%
  group_by(sample, type, essential, bin) %>%
  mutate(
    total_inserts_bin = sum(inserts_per_bin),
    mean_inserts_bin = mean(inserts_per_bin),
    median_inserts_bin = median(inserts_per_bin),
    ngene_norm_total_inserts_bin = dplyr::if_else(
      essential == "yes",
      total_inserts_bin / nrow(ess_del),
      total_inserts_bin / (nrow(all_genes) - nrow(ess_del))
    )
  ) %>%
  dplyr::select(
    sample, type, essential, bin,
    total_inserts_bin, mean_inserts_bin, median_inserts_bin,
    ngene_norm_total_inserts_bin
  ) %>%
  distinct()

# Factor copy of `type` with promoter ordered before cds
meta_bin_insert_essential[["type_order"]] <- factor(
  meta_bin_insert_essential[["type"]],
  levels = c("promoter", "cds")
)
# Panel A: gene-count-normalised insertions along the CDS (bins 1-100),
# essential vs non-essential genes, one facet per library.
# Library ID -> facet label; IDs missing from the table become NA.
sample_to_strain <- c(
  "1657_1" = "eu_1",
  "1657_2" = "eu_2",
  "1728" = "aneu",
  "1734" = "trip1",
  "1747" = "trip2",
  "1751" = "trip3",
  "1736" = "trip4",
  "1744" = "iso",
  "1740" = "quad"
)

sfig3a <- meta_bin_insert_essential %>%
  filter(type == "cds") %>%
  mutate(strain_names_rep = unname(sample_to_strain[sample])) %>%
  ggplot(aes(x = bin, y = ngene_norm_total_inserts_bin, color = essential)) +
  geom_line() +
  facet_wrap(~strain_names_rep) +
  # Colours and labels follow the sorted values of `essential`: "no", "yes"
  scale_color_manual(
    values = c("#80b1d3", "#fb8072"),
    name = "",
    labels = c("Non-essential", "Essential")
  ) +
  labs(x = "% of CDS", y = "Mean unique insertion sites")
# Panel B: mean insertions per CDS bin, split by Galactose fitness quartile,
# one facet per library.
# Library ID -> facet label; IDs missing from the table become NA.
sample_to_strain <- c(
  "1657_1" = "eu_1",
  "1657_2" = "eu_2",
  "1728" = "aneu",
  "1734" = "trip1",
  "1747" = "trip2",
  "1751" = "trip3",
  "1736" = "trip4",
  "1744" = "iso",
  "1740" = "quad"
)

sfig3b <- binned_inserts %>%
  ungroup() %>%
  filter(type == "cds") %>%
  group_by(sample, type, quartile, bin) %>%
  mutate(
    total_inserts_bin = sum(inserts_per_bin),
    mean_inserts_bin = mean(inserts_per_bin),
    median_inserts_bin = median(inserts_per_bin)
  ) %>%
  dplyr::select(
    sample, type, essential, bin,
    total_inserts_bin, mean_inserts_bin, median_inserts_bin, quartile
  ) %>%
  # Collapse to the distinct combinations of the selected columns
  distinct() %>%
  mutate(strain_names_rep = unname(sample_to_strain[sample])) %>%
  ggplot(aes(x = bin, y = mean_inserts_bin, color = quartile)) +
  geom_line() +
  facet_wrap(~strain_names_rep) +
  # Default colour scale is used here; no manual palette is set
  labs(x = "% of CDS", y = "Mean unique insertion sites")
# Print the Galactose quantiles (the boundaries behind the quartile labels)
quantile(fit_gal$Galactose, na.rm = TRUE)

# patchwork design: panel A stacked above panel B
layout <- "
A
B
"

sfig3 <- sfig3a + sfig3b +
  plot_layout(design = layout) +
  plot_annotation(tag_levels = "A")

# Same figure as PDF and PNG
for (sfig3_file in c("/SFig3.pdf", "/SFig3.png")) {
  ggsave(paste0(fig_dir, sfig3_file), plot = sfig3, width = 12, height = 16, units = "in")
}
---
title: "Transposon mutagenesis in *GAP1* CNV strains"
author: "Grace Avecilla"
output: html_notebook
---


```{r}
library(EnhancedVolcano)
library(DESeq2)
library(org.Sc.sgd.db)
library(clusterProfiler)
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggpubr)
# Default ggplot2 theme for every figure in this notebook
theme_set(theme_bw(base_size = 20))

# Input data read by the chunks below
data_dir <- "/Volumes/GoogleDrive/My Drive/Gresham Lab_Grace/france_satay/Hermes_mutagenesis_paper/data"

# Destination for figures and supplementary tables
fig_dir <- "/Volumes/GoogleDrive/My Drive/Gresham Lab_Grace/france_satay/Hermes_mutagenesis_paper/figures"

# Shared helper script.
# NOTE(review): objects used below but not defined in this notebook
# (e.g. `strain_names`, `strain_cols`, `gff_cds`) presumably come from here - confirm.
source("/Volumes/GoogleDrive/My Drive/Gresham Lab_Grace/france_satay/Hermes_mutagenesis_paper/analysis_w_PS_corrections/functions.R")

# Sequencing runs; each has its own sub-directory under <data_dir>/hpc_output/
seq_runs <- c("bgi1", "bgi2", "nyc1", "nyc2")
```

```{r}
# get all genes excluding dubious orfs and their copy number
strain_cns <- paste0(data_dir, "/gene_median_relative_depth_DNA_corrected_v3/gene_median_relative_depth_DNA-Table 1.csv") %>%
  read_csv() %>%
  # Keep the gene name plus every column whose name contains "cor"
  dplyr::select(Gene, contains("cor")) %>%
  pivot_longer(cols = -Gene, names_to = "strain", values_to = "copy_number") %>%
  # Reduce each former column name to its first run of four digits
  mutate(strain = str_extract(strain, "[0-9]{4}")) %>%
  left_join(strain_names) # NOTE(review): natural join, presumably on `strain` - confirm

# NOTE(review): this is a character vector with one entry per gene x strain
# row (so genes repeat), while later chunks use `all_genes$X1` and
# `nrow(all_genes)` - confirm which object those chunks expect.
all_genes <- strain_cns[["Gene"]]
```

# Correlation between different sequencing methods
Using insertions per gene
```{r}
# Read one insertions-per-gene table produced by a sequencing run.
#   x   - file name inside <data_dir>/hpc_output/<ver>/
#   ver - name of the run sub-directory; also stored in the `run` column
# Returns (invisibly) the table with three extra columns: `sample` (the file
# name minus its last 21 characters), `gene` (all NA) and `run`.
get_ipg <- function(x, ver) {
  ipg_path <- paste0(data_dir, "/hpc_output/", ver, "/", x)
  ipg_spec <- cols(CDS = col_character(), `#insertion` = col_double())
  per_gene <- read_tsv(ipg_path, col_types = ipg_spec)
  per_gene <- mutate(per_gene, sample = str_sub(x, 1, -22), gene = NA, run = ver)
  invisible(per_gene)
}
# Collect the insertions-per-gene tables of every sequencing run into one long
# table; `sample` and `run` are merged into a single `sample_rep` key.
# list.files() interprets `pattern` as a regular expression (not a glob), so
# the file-name suffix is matched with an escaped, end-anchored pattern.
# Each run is read independently and the pieces are bound once at the end, so
# a run directory without matching files simply contributes no rows.
ipg_forcor <- map(seq_runs, function(run_id) {
  run_dir <- paste0(data_dir, "/hpc_output/", run_id, "/")
  ipg_files <- list.files(path = run_dir, pattern = "insertionPerGene\\.txt$")
  bind_rows(map(ipg_files, get_ipg, ver = run_id))
}) %>%
  bind_rows() %>%
  unite(sample_rep, sample, run)
```

```{r}
# get correlations
# `sample_rep` is "<sample>_<run>" with a 4-character run name, so the last
# four characters are the run and everything before the separating "_" is the
# sample ID. The ID is joined against `strain_names` and the table is spread
# to one column of insertion counts per "<strain_names>_<run>" library.
ipg_forcor <- ipg_forcor %>%
  mutate(
    strain = str_sub(sample_rep, 1, -6),
    rep = str_sub(sample_rep, -4, -1)
  ) %>%
  left_join(strain_names) %>% # NOTE(review): natural join, presumably on `strain` - confirm
  unite(sample_rep, strain_names, rep) %>%
  dplyr::select(CDS, sample_rep, `#insertion`) %>%
  pivot_wider(names_from = sample_rep, values_from = `#insertion`)

# Draw the pairwise scatter / Pearson-correlation matrix for one set of
# library columns and save it as <fig_dir>/SFigX<panel_letter>.pdf.
save_cor_panel <- function(reps, panel_letter) {
  cor_plots <- GGally::ggpairs(
    reps,
    upper = list(continuous = GGally::wrap("cor", method = "pearson"))
  ) +
    ggtitle(panel_letter)
  # Arguments are spelled out in full: the previous `file =` only worked
  # through partial matching of ggsave()'s `filename` argument.
  ggsave(
    filename = paste0(fig_dir, "/SFigX", panel_letter, ".pdf"),
    plot = cor_plots,
    height = 10,
    width = 10
  )
}

# Panel A: the two eu libraries across the nyc runs
save_cor_panel(
  dplyr::select(ipg_forcor, eu_1_nyc1, eu_1_nyc2, eu_2_nyc1, eu_2_nyc2),
  LETTERS[1]
)

# Panels B onwards: one panel per remaining strain, using every column whose
# name contains the strain label
cnv_strains <- c("aneu", "trip1", "trip2", "trip3", "trip4", "iso", "quad")
for (k in seq_along(cnv_strains)) {
  rep_cols <- colnames(ipg_forcor)[str_detect(colnames(ipg_forcor), cnv_strains[k])]
  save_cor_panel(dplyr::select(ipg_forcor, rep_cols), LETTERS[k + 1])
}

```

```{r get reads per position}
# Read one header-less reads-per-position table.
#   x   - file name inside <data_dir>/hpc_output/<ver>/
#   ver - sub-directory name; also stored in the `version` column
# Returns (invisibly) the table with its first three columns named
# `chromosome`, `chr_pos` and `reads`, plus `sample` (the file name minus its
# last 15 characters) and `version`.
get_rpp <- function(x, ver) {
  rpp_path <- paste0(data_dir, "/hpc_output/", ver, "/", x)
  raw_rpp <- read_tsv(rpp_path, col_names = FALSE, col_types = cols())
  named_rpp <- dplyr::rename(raw_rpp, chromosome = X1, chr_pos = X2, reads = X3)
  invisible(mutate(named_rpp, sample = str_sub(x, 1, -16), version = ver))
}
# Load every reads-per-position file of the combined run into one table.
# list.files() interprets `pattern` as a regular expression (not a glob), so
# the suffix is escaped and anchored at the end of the file name.
files <- list.files(
  path = paste0(data_dir, "/hpc_output/combined/"),
  pattern = "readPerPos\\.txt$"
)
read_per_pos <- map(files, get_rpp, ver = "combined")
# bind_rows() binds the whole list in one call and returns an empty tibble
# (rather than NULL) when no file was found.
rpp_df <- bind_rows(read_per_pos)

```

```{r supplementary table 1 library characteristics}
# Supplementary Table 1: per-library summary of the reads-per-position table.
# For every (strain_names, Gresham_ID) pair: number of rows (`total_sites`)
# and the min / max / mean / median of `reads`.
summary_rpp <- rpp_df %>%
  left_join(strain_names, by = c("sample" = "strain")) %>%
  rename(Gresham_ID = sample) %>%
  group_by(strain_names, Gresham_ID) %>%
  summarize(
    total_sites = n(),
    min_rpp = min(reads),
    max_rpp = max(reads),
    mean_rpp = mean(reads),
    median_rpp = median(reads)
  )

# Show the table in the notebook, then write it next to the figures
kableExtra::kable_styling(knitr::kable(summary_rpp))
write_csv(summary_rpp, paste0(fig_dir, "/SupplementaryTable1.csv"))
```

```{r sfig2}
# number of insertion sites scales with total reads generated
reads_per_lib <- read_csv(paste0(data_dir, "/total_reads_per_library.csv"))

# One point per library: rows counted in `summary_rpp` (x) against the `reads`
# column of the per-library table (y), coloured by strain.
sfig2 <- reads_per_lib %>%
  left_join(summary_rpp, by = c("Sample" = "Gresham_ID")) %>%
  ggplot(aes(x = total_sites, y = reads, color = strain_names)) +
  geom_point(size = 2) +
  scale_color_manual(values = strain_cols) +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "Unique insertion sites", y = "Total reads ") +
  theme(legend.title = element_blank())

# Same figure as PDF and PNG
for (sfig2_file in c("/SFig2.pdf", "/SFig2.png")) {
  ggsave(paste0(fig_dir, sfig2_file), plot = sfig2, width = 8.5, height = 5, units = "in")
}
```


# Supplemental Fig S9

```{r Supplemental_Fig_S9}
# essential genes have fewer insertions
# Look up the coordinates of one CDS in an annotation table.
#   x - ID prefix of the attribute column, e.g. "ID=cds0"
#   y - annotation table; column X1 is used as chromosome, X4 / X5 as
#       coordinates, X7 as strand and X9 as the attribute string
# Returns a one-row tibble (chromosome, start, stop, promoter, strand). When
# several rows match, the smallest X4 and largest X5 are used. For '+' genes
# start = X4, stop = X5 and promoter = X4 - 200; otherwise start = X5,
# stop = X4 and promoter = X5 + 200.
# Stops with an explicit message when no row matches `x`.
get_cds_promoter <- function(x, y) {
  # fixed(): the ID is matched literally, so characters such as "." in an ID
  # are not treated as regular-expression wildcards
  hits <- dplyr::filter(y, str_starts(X9, fixed(paste0(x, ";"))))
  if (nrow(hits) == 0) {
    stop("No annotation row has an attribute field starting with '", x, ";'",
      call. = FALSE
    )
  }

  chromosome <- hits$X1[1]
  strand <- hits$X7[1]
  lowest <- min(hits$X4)
  highest <- max(hits$X5)

  if (strand == "+") {
    start <- lowest
    cds_stop <- highest
    promoter <- lowest - 200
  } else {
    start <- highest
    cds_stop <- lowest
    promoter <- highest + 200
  }
  tibble(chromosome, start, stop = cds_stop, promoter, strand)
}
# x is a gene that I want to get info for (in "ID=cds0" form), inserts is the
# reads per position table (its sample, chromosome and chr_pos columns are used).
# The gene coordinates come from get_cds_promoter() on the global `gff_cds`.
# Returns, for each sample in inserts, one "cds" row and one "promoter" row
# with columns type, id, sample and "1".."100": the number of rows of
# `inserts` falling in each of 100 bins. CDS bins split start..stop into 100
# equal parts; promoter bins are 2 bp wide and begin at `promoter` (200 bp
# before start). Bins are numbered in the direction of transcription and are
# half-open (near edge included, far edge excluded), so a position is never
# counted in two bins. All columns are character because each row is built
# from a named character vector.
get_binned_unique_insertions <- function(x, inserts) {
  n_bins <- 100
  y <- get_cds_promoter(x, y = gff_cds)
  # The bin width must be positive on both strands. get_cds_promoter() returns
  # start < stop for '+' genes, so the signed difference start - stop gave
  # them a negative width and every '+' strand CDS bin stayed empty.
  bin_size_cds <- abs(y$start - y$stop) / n_bins
  bin_size_prom <- 2
  # +1: bins run towards higher coordinates ('+'); -1: towards lower ones
  direction <- if (y$strand == "+") 1 else -1

  # Count the positions in each bin. Bin j spans the distance
  # [width * (j - 1), width * j) from `origin`, measured along `direction`;
  # `in_region` masks positions outside the feature.
  count_bins <- function(pos, origin, width, in_region) {
    vapply(seq_len(n_bins), function(j) {
      near_edge <- origin + direction * width * (j - 1)
      far_edge <- origin + direction * width * j
      in_bin <- if (direction > 0) {
        pos >= near_edge & pos < far_edge
      } else {
        pos <= near_edge & pos > far_edge
      }
      sum(in_bin & in_region, na.rm = TRUE)
    }, integer(1))
  }

  cds <- NULL
  promo <- NULL
  for (i in unique(inserts$sample)) {
    ini <- inserts %>% dplyr::filter(sample == i, chromosome == y$chromosome)
    pos <- ini$chr_pos
    if (direction > 0) {
      in_cds <- pos <= y$stop
      in_prom <- pos < y$start
    } else {
      in_cds <- pos >= y$stop
      in_prom <- pos > y$start
    }
    bin_fill_cds <- count_bins(pos, y$start, bin_size_cds, in_cds)
    bin_fill_prom <- count_bins(pos, y$promoter, bin_size_prom, in_prom)
    names(bin_fill_cds) <- seq_len(n_bins)
    names(bin_fill_prom) <- seq_len(n_bins)
    cds <- cds %>% bind_rows(c(type = "cds", id = x, sample = i, bin_fill_cds))
    promo <- promo %>% bind_rows(c(type = "promoter", id = x, sample = i, bin_fill_prom))
  }
  bind_rows(cds, promo)
}

# Bin the insertions of every gene in every library and stack the per-gene
# tables (slow: one get_binned_unique_insertions() call per gene).
# NOTE(review): this expects `all_genes` to have an `X1` column of IDs in
# "ID=cds0" form - confirm where that table is created.
binned_inserts <- all_genes$X1 %>%
  map(get_binned_unique_insertions, inserts = rpp_df) %>%
  do.call(what = rbind)

# Optional cache of the result:
#write_csv(binned_inserts, "./binned_inserts_temp.csv")
#read_csv("./binned_inserts_temp.csv")

# Translate IDs found in labtools::yeast_gene_names$GCF_000146045.2_R64_genomic_ID
# to the matching "Systematic_name" entry; IDs without a translation are
# returned unchanged.
#   name_vec - vector of IDs to translate
yeast_r64_to_systematic <- function(name_vec) {
  lookup <- labtools::yeast_gene_names
  row_idx <- match(name_vec, lookup$GCF_000146045.2_R64_genomic_ID, nomatch = NA)
  systematic <- lookup[row_idx, "Systematic_name"]
  # Fall back to the input wherever the lookup produced NA
  untranslated <- is.na(systematic)
  systematic[untranslated] <- name_vec[untranslated]
  systematic
}

# Add the systematic gene name for every CDS ID. This used to assign into `t`,
# which is not created anywhere in the visible code (it resolves to base::t),
# while the following steps read the `gene` column of `binned_inserts`.
binned_inserts$gene <- yeast_r64_to_systematic(binned_inserts$id)

# get list of essential genes from Winzeler 1999
# Lines starting with "=" are skipped; only the ORF name is kept (as `gene`)
# together with a constant flag column.
ess_del <- read_tsv(
  paste0(data_dir, "/Essential_ORFs.txt"),
  col_names = TRUE,
  comment = "=",
  col_types = cols(
    rec_num = col_double(),
    ORF_name = col_character(),
    deletion_alias = col_character(),
    gene_names = col_character(),
    UPTAG_sequence_20mer = col_character(),
    DNTAG_sequence_20mer = col_character()
  )
) %>%
  dplyr::select(gene = ORF_name) %>%
  mutate(ess_del = "yes")

# get gene fitness in ypgal from costanzo et al 2021
# dplyr::select is namespaced like elsewhere in this file so that a `select`
# from another attached package cannot be picked up instead.
fit_gal <- read_csv(
  paste0(data_dir, "/Costanzo_Mutant Fitness_Conditions-Table 1.csv"),
  col_names = TRUE
) %>%
  dplyr::select(`Systematic Name`, `Gene Name`, `Allele (Essential genes only)`, `Galactose`)

# Quartile boundaries of the Galactose values (0/25/50/75/100 %), computed
# once; rows with a missing Galactose value get an NA quartile.
gal_breaks <- quantile(fit_gal$Galactose, na.rm = TRUE)
fit_gal <- fit_gal %>%
  mutate(quartile = case_when(
    Galactose <= gal_breaks[[2]] ~ "Q1",
    Galactose <= gal_breaks[[3]] ~ "Q2",
    Galactose <= gal_breaks[[4]] ~ "Q3",
    Galactose <= gal_breaks[[5]] ~ "Q4"
  ))

# Flag essential genes, attach the Galactose fitness quartile and reshape the
# 100 bin columns to long form (one row per gene x sample x type x bin).
# `quartile` must be listed among the identifier columns: left out, it was
# pivoted as if it were a bin, which turned it into NA values and removed the
# column that the quartile plot groups by.
# NOTE(review): if `fit_gal` holds several rows per `Systematic Name`, this
# join repeats the gene's rows - confirm the table is unique per gene.
binned_inserts <- binned_inserts %>%
  mutate(essential = dplyr::if_else(gene %in% ess_del$gene, "yes", "no")) %>%
  left_join(
    fit_gal %>% dplyr::select(`Systematic Name`, quartile),
    by = c("gene" = "Systematic Name")
  ) %>%
  pivot_longer(
    cols = -c(gene, essential, quartile, type, id, sample),
    names_to = "bin",
    values_to = "inserts_per_bin"
  ) %>%
  # Bin labels and counts arrive as character strings
  mutate(across(c(bin, inserts_per_bin), as.numeric))


# Per sample / feature type / essentiality / bin: total, mean and median
# insertions per bin, plus the total divided by a gene count (number of rows
# of `ess_del` for essential genes, remaining genes otherwise).
# The statistics are computed per group with mutate() and then collapsed to
# one row per group with distinct(); the result stays grouped.
# NOTE(review): nrow(all_genes) assumes `all_genes` is a data frame - confirm.
meta_bin_insert_essential <- binned_inserts %>%
  group_by(sample, type, essential, bin) %>%
  mutate(
    total_inserts_bin = sum(inserts_per_bin),
    mean_inserts_bin = mean(inserts_per_bin),
    median_inserts_bin = median(inserts_per_bin),
    ngene_norm_total_inserts_bin = dplyr::if_else(
      essential == "yes",
      total_inserts_bin / nrow(ess_del),
      total_inserts_bin / (nrow(all_genes) - nrow(ess_del))
    )
  ) %>%
  dplyr::select(
    sample, type, essential, bin,
    total_inserts_bin, mean_inserts_bin, median_inserts_bin,
    ngene_norm_total_inserts_bin
  ) %>%
  distinct()

# Factor copy of `type` with promoter ordered before cds
meta_bin_insert_essential[["type_order"]] <- factor(
  meta_bin_insert_essential[["type"]],
  levels = c("promoter", "cds")
)

# Panel A: gene-count-normalised insertions along the CDS (bins 1-100),
# essential vs non-essential genes, one facet per library.
# Library ID -> facet label; IDs missing from the table become NA.
sample_to_strain <- c(
  "1657_1" = "eu_1",
  "1657_2" = "eu_2",
  "1728" = "aneu",
  "1734" = "trip1",
  "1747" = "trip2",
  "1751" = "trip3",
  "1736" = "trip4",
  "1744" = "iso",
  "1740" = "quad"
)

sfig3a <- meta_bin_insert_essential %>%
  filter(type == "cds") %>%
  mutate(strain_names_rep = unname(sample_to_strain[sample])) %>%
  ggplot(aes(x = bin, y = ngene_norm_total_inserts_bin, color = essential)) +
  geom_line() +
  facet_wrap(~strain_names_rep) +
  # Colours and labels follow the sorted values of `essential`: "no", "yes"
  scale_color_manual(
    values = c("#80b1d3", "#fb8072"),
    name = "",
    labels = c("Non-essential", "Essential")
  ) +
  labs(x = "% of CDS", y = "Mean unique insertion sites")

# Panel B: mean insertions per CDS bin, split by Galactose fitness quartile,
# one facet per library.
# Library ID -> facet label; IDs missing from the table become NA.
sample_to_strain <- c(
  "1657_1" = "eu_1",
  "1657_2" = "eu_2",
  "1728" = "aneu",
  "1734" = "trip1",
  "1747" = "trip2",
  "1751" = "trip3",
  "1736" = "trip4",
  "1744" = "iso",
  "1740" = "quad"
)

sfig3b <- binned_inserts %>%
  ungroup() %>%
  filter(type == "cds") %>%
  group_by(sample, type, quartile, bin) %>%
  mutate(
    total_inserts_bin = sum(inserts_per_bin),
    mean_inserts_bin = mean(inserts_per_bin),
    median_inserts_bin = median(inserts_per_bin)
  ) %>%
  dplyr::select(
    sample, type, essential, bin,
    total_inserts_bin, mean_inserts_bin, median_inserts_bin, quartile
  ) %>%
  # Collapse to the distinct combinations of the selected columns
  distinct() %>%
  mutate(strain_names_rep = unname(sample_to_strain[sample])) %>%
  ggplot(aes(x = bin, y = mean_inserts_bin, color = quartile)) +
  geom_line() +
  facet_wrap(~strain_names_rep) +
  # Default colour scale is used here; no manual palette is set
  labs(x = "% of CDS", y = "Mean unique insertion sites")

# Print the Galactose quantiles (the boundaries behind the quartile labels)
quantile(fit_gal$Galactose, na.rm = TRUE)

# patchwork design: panel A stacked above panel B
layout <- "
A
B
"

sfig3 <- sfig3a + sfig3b +
  plot_layout(design = layout) +
  plot_annotation(tag_levels = "A")

# Same figure as PDF and PNG
for (sfig3_file in c("/SFig3.pdf", "/SFig3.png")) {
  ggsave(paste0(fig_dir, sfig3_file), plot = sfig3, width = 12, height = 16, units = "in")
}
```
