library(EnhancedVolcano)
library(DESeq2)
library(org.Sc.sgd.db)
library(clusterProfiler)
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggpubr)
# Default ggplot2 theme for every figure produced below.
theme_set(theme_bw(base_size = 20))

# Root of the project on the mounted Google Drive. Defined once so the data,
# figure and helper-script locations cannot drift apart.
project_dir <- "/Volumes/GoogleDrive/My Drive/Gresham Lab_Grace/france_satay/Hermes_mutagenesis_paper"

data_dir <- file.path(project_dir, "data")
fig_dir <- file.path(project_dir, "figures")

# NOTE(review): objects used later but not defined in this file (strain_names,
# all_genes_systematic, get_insert_profile, summary_rpp, ess_del) presumably
# come from this script - confirm.
source(file.path(project_dir, "analysis_w_PS_corrections", "functions.R"))

# Sequencing runs; each is a sub-directory of <data_dir>/hpc_output.
seq_runs <- c("bgi1", "bgi2", "nyc1", "nyc2")
# Get all genes (excluding dubious ORFs, per the original note) and their copy
# number. Only `Gene` and the columns whose names contain "cor" are kept; they
# are reshaped to one row per gene/strain, the strain label is reduced to its
# first run of four digits, and the strain_names lookup is joined on.
strain_cns <- paste0(
  data_dir,
  "/gene_median_relative_depth_DNA_corrected_v3/gene_median_relative_depth_DNA-Table 1.csv"
) %>%
  read_csv() %>%
  select(Gene, contains("cor")) %>%
  pivot_longer(cols = -Gene, names_to = "strain", values_to = "copy_number") %>%
  mutate(strain = str_extract(strain, "[0-9]{4}")) %>%
  left_join(strain_names)

# Gene identifiers, one entry per row of strain_cns.
all_genes <- strain_cns[["Gene"]]
# Correlation between different sequencing methods
# Using insertions per gene
# Read one insertions-per-gene table of a sequencing run.
#
# x:   file name inside <data_dir>/hpc_output/<ver>/.
# ver: sequencing run (sub-directory name); stored in the `run` column.
#
# Returns the parsed table with three extra columns: `sample` (the file name
# without its last 21 characters), `gene` (always NA) and `run`.
# Reads the global `data_dir`.
get_ipg <- function(x, ver) {
  # The pipeline is the last expression, so the table is returned visibly
  # instead of through an assignment to an unused local.
  read_tsv(
    paste0(data_dir, "/hpc_output/", ver, "/", x),
    col_types = cols(CDS = col_character(), `#insertion` = col_double())
  ) %>%
    mutate(sample = str_sub(x, 1, -22), gene = NA, run = ver)
}
# Load every insertions-per-gene file of every sequencing run into one long
# table. `sample` and `run` are merged into `sample_rep` ("<sample>_<run>").
ipg_forcor <- map(seq_runs, function(seq_run) {
  # list.files() takes a regular expression, not a glob: escape the dot and
  # anchor the suffix at the end of the file name.
  ipg_files <- list.files(
    path = paste0(data_dir, "/hpc_output/", seq_run, "/"),
    pattern = "insertionPerGene\\.txt$"
  )
  if (length(ipg_files) == 0) {
    # Skip the run instead of failing later on a table without columns.
    warning("No insertionPerGene files found for run ", seq_run, call. = FALSE)
    return(NULL)
  }
  map(ipg_files, get_ipg, ver = seq_run) %>%
    bind_rows() %>%
    unite(sample_rep, sample, run)
}) %>%
  bind_rows()
# get correlations
# Split `sample_rep` back into strain (everything but the last 5 characters)
# and run (the last 4 characters), swap the strain id for its display name from
# strain_names, and spread to one column of insertion counts per "<name>_<run>".
ipg_forcor <- ipg_forcor %>%
  mutate(
    strain = str_sub(sample_rep, 1, -6),
    rep = str_sub(sample_rep, -4, -1)
  ) %>%
  left_join(strain_names) %>%
  unite(sample_rep, strain_names, rep) %>%
  dplyr::select(CDS, sample_rep, `#insertion`) %>%
  pivot_wider(names_from = sample_rep, values_from = `#insertion`)

# Draw the pairwise scatter/Pearson-correlation matrix of the columns of `reps`
# and save it as <fig_dir>/SFigX<panel_letter>.pdf. Returns the plot invisibly.
save_cor_plot <- function(reps, panel_letter) {
  cor_plot <- GGally::ggpairs(
    reps,
    upper = list(continuous = GGally::wrap("cor", method = "pearson"))
  ) +
    ggtitle(panel_letter)
  # Name both arguments: ggsave()'s first parameter is the file name, not the plot.
  ggsave(
    filename = paste0(fig_dir, "/SFigX", panel_letter, ".pdf"),
    plot = cor_plot,
    height = 10,
    width = 10
  )
  invisible(cor_plot)
}

# Panel A: the euploid replicates from the two nyc runs.
save_cor_plot(
  ipg_forcor %>% dplyr::select(eu_1_nyc1, eu_1_nyc2, eu_2_nyc1, eu_2_nyc2),
  LETTERS[1]
)

# Panels B onwards: one panel per CNV strain, using every column whose name
# matches the strain label.
cnv_strain_labels <- c("aneu", "trip1", "trip2", "trip3", "trip4", "iso", "quad")
for (idx in seq_along(cnv_strain_labels)) {
  strain_cols <- str_subset(colnames(ipg_forcor), cnv_strain_labels[[idx]])
  save_cor_plot(
    ipg_forcor %>% dplyr::select(all_of(strain_cols)),
    LETTERS[idx + 1]
  )
}
# Read one header-less reads-per-position table.
#
# x:   file name inside <data_dir>/hpc_output/<ver>/.
# ver: sub-directory name; stored in the `version` column.
#
# Returns the table with its first three columns named `chromosome`, `chr_pos`
# and `reads`, plus `sample` (the file name without its last 15 characters)
# and `version`. Reads the global `data_dir`.
get_rpp <- function(x, ver) {
  filepath <- paste0(data_dir, "/hpc_output/", ver, "/", x)
  # FALSE is spelled out (F can be reassigned) and the pipeline is the last
  # expression so the table is returned visibly.
  read_tsv(filepath, col_names = FALSE, col_types = cols()) %>%
    dplyr::rename(chromosome = X1, chr_pos = X2, reads = X3) %>%
    mutate(sample = str_sub(x, 1, -16), version = ver)
}
# Read every reads-per-position file of the "combined" run into one table.
# list.files() takes a regular expression, not a glob: escape the dot and
# anchor the suffix at the end of the file name.
files <- list.files(
  path = paste0(data_dir, "/hpc_output/combined/"),
  pattern = "readPerPos\\.txt$"
)
read_per_pos <- map(files, get_rpp, ver = "combined")
rpp_df <- bind_rows(read_per_pos)
# Get insertion profiles
# get insertion profiles for all genes
insert_profiles <- map(all_genes_systematic, get_insert_profile, data = rpp_df)
insert_profiles <- do.call(rbind, insert_profiles)

# add in zero categories
# Genes without any profile row are brought in through the right join, then
# every sample x gene x type combination is enumerated. The grid gets its own
# name rather than `t`, which would shadow base::t().
profile_grid <- insert_profiles %>%
  right_join(tibble(all_genes_systematic), by = c("gene" = "all_genes_systematic")) %>%
  mutate(across(c(sample, gene, type), factor)) %>%
  tidyr::expand(sample, gene, type)

# Combinations missing from the observed profiles get counts of 0.
# (A `filter(!is.na(sample))` step was left disabled here by the author.)
insert_profiles <- insert_profiles %>%
  right_join(profile_grid) %>%
  mutate(
    n_insertions = replace_na(n_insertions, 0),
    normalized_insertions = replace_na(normalized_insertions, 0)
  ) %>%
  distinct()

# write out so this doesn't need to be done again
write_csv(insert_profiles, paste0(data_dir, "/insertion_profiles.csv"))
# Load the cached insertion profiles from <data_dir>/insertion_profiles.csv.
insert_profiles <- paste0(data_dir, "/insertion_profiles.csv") %>%
  read_csv()
# Insertions in coding sequence per gene and sample, scaled per million sites:
# rows of type "cds" are summed within each sample/gene pair and divided by the
# sample's `total_sites` (from summary_rpp) expressed in millions.
inserts_per_mill_cds <- insert_profiles %>%
  left_join(strain_names, by = c("sample" = "strain")) %>%
  left_join(summary_rpp) %>%
  filter(type %in% c("cds")) %>%
  group_by(sample, gene) %>%
  mutate(inserts = sum(n_insertions)) %>%
  ungroup() %>%
  select(sample, strain_names, gene, inserts, total_sites) %>%
  distinct() %>%
  mutate(inserts_per_mill = inserts / (total_sites / 1e6))
# Look at genes in CNV region
# Gene/strain rows of strain_cns with a copy number of at least 2; the `strain`
# column is renamed to `sample`.
cnv_genes <- rename(
  filter(strain_cns, copy_number >= 2),
  sample = strain
)
# Data shared by the test and the plot: genes whose ID starts with "YK",
# without the gene/strain pairs listed in cnv_genes, with genes missing from
# the ess_del table labelled "not essential".
# cnv_genes names its gene column `Gene`, so a bare anti_join(cnv_genes) would
# match on strain columns only and drop whole strains; the key is spelled out.
# Assumes `Gene` uses the same identifiers as `gene` - TODO confirm.
sfig4a_data <- inserts_per_mill_cds %>%
  left_join(ess_del) %>%
  filter(str_starts(gene, "YK")) %>%
  anti_join(
    cnv_genes %>% select(gene = Gene, strain_names),
    by = c("gene", "strain_names")
  ) %>%
  mutate(ess_del = if_else(is.na(ess_del), "not essential", ess_del))

# Per-sample t-test of normalized insertions between the two ess_del classes.
# The label ladder has no gaps: previously p in (0.0001, 0.01] became NA.
# The original cut-off for "ns" (p > 0.01) is kept.
sfigq_sig <- sfig4a_data %>%
  group_by(sample) %>%
  rstatix::t_test(inserts_per_mill ~ ess_del) %>%
  ungroup() %>%
  mutate(significance = case_when(
    p <= 0.0001 ~ "****",
    p <= 0.001 ~ "***",
    p <= 0.01 ~ "**",
    p > 0.01 ~ "ns"
  ))

# NOTE(review): the legend labels are applied in the sorted order of the
# ess_del values; confirm that order is non-essential first.
# NOTE(review): bracket positions are computed on `sample` while the x axis
# shows `strain_names`; confirm both order the strains identically.
sfig4a <- sfig4a_data %>%
  ggplot(aes(strain_names, inserts_per_mill, color = ess_del)) +
  ylab("Normalized insertions") +
  xlab("") +
  # `cex` is not a geom_point parameter and was ignored, so it is dropped.
  geom_point(alpha = 0.7, size = 1, position = position_jitterdodge()) +
  geom_boxplot(outlier.shape = NA, alpha = 0, lwd = 1) +
  scale_color_manual(
    values = c("#80b1d3", "#fb8072"),
    name = "",
    labels = c("Non-essential", "Essential")
  ) +
  stat_pvalue_manual(
    sfigq_sig %>% rstatix::add_xy_position(x = "sample"),
    y.position = 1500,
    label = "significance"
  ) +
  theme(
    legend.background = element_rect(fill = "transparent"),
    legend.position = c(.85, .73)
  )

sfig4a
# non-amplified genes
# One row per gene and CNV strain, holding the strain's normalized insertions
# next to the mean of the two euploid samples (eu_1, eu_2). Gene/strain pairs
# listed in cnv_genes are removed first.
# cnv_genes names its gene column `Gene`, so it is renamed for the join.
# Assumes `Gene` uses the same identifiers as `gene` - TODO confirm.
tempo <- inserts_per_mill_cds %>%
  select(strain_names, inserts_per_mill, gene) %>%
  anti_join(
    cnv_genes %>% select(gene = Gene, strain_names),
    by = c("gene", "strain_names")
  ) %>%
  pivot_wider(names_from = "strain_names", values_from = "inserts_per_mill") %>%
  mutate(mean1657 = (eu_1 + eu_2) / 2) %>%
  pivot_longer(
    cols = c(aneu, trip1, trip2, trip3, trip4, iso, quad),
    names_to = "strain",
    values_to = "inserts_per_mill"
  )

# Regress one CNV strain's normalized insertions on the euploid mean and return
# a one-row tibble of fit statistics. `n_greater2sigma` counts residuals whose
# absolute value exceeds the mean residual plus two residual standard errors.
fit_strain_regression <- function(samp) {
  tx <- tempo %>%
    filter(strain == samp, !is.na(inserts_per_mill))
  fit <- lm(inserts_per_mill ~ mean1657, data = tx)
  fit_summary <- summary(fit)
  coef_pvals <- fit_summary$coefficients[, 4]
  slope_ci <- confint(fit, 2, level = 0.95)
  resid_sd <- sigma(fit)
  fit_resid <- residuals(fit)
  tibble(
    samp = samp,
    intercept = unname(coef(fit)[1]),
    intercept_pval = unname(coef_pvals[1]),
    slope = unname(coef(fit)[2]),
    slope_pval = unname(coef_pvals[2]),
    slope_ci2.5 = slope_ci[1],
    slope_ci97.5 = slope_ci[2],
    adjr2 = fit_summary$adj.r.squared,
    resid_sd = resid_sd,
    n_greater2sigma = sum(abs(fit_resid) > mean(fit_resid) + resid_sd * 2)
  )
}

amp_regress <- map(unique(tempo$strain), fit_strain_regression) %>%
  bind_rows()

sfig4b <- tempo %>%
  #left_join(strain_names) %>%
  #filter(strain != "1657_1", strain != "1657_2") %>%
  #filter(!is.na(inserts_per_mill), mean1657 < 750) %>%
  ggplot(aes(mean1657, inserts_per_mill)) +
  #ggiraph::geom_point_interactive(aes(data_id = common_name, tooltip = common_name), alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  geom_point(alpha = 0.5) +
  facet_wrap(~strain) +
  theme_bw(base_size = 16) +
  theme(legend.position = "none") +
  ylab("CNV strain normalized insertions") +
  xlab("Euploid normalized insertions") +
  # Per-facet annotation with the fit statistics computed above.
  geom_text(
    data = amp_regress %>% rename(strain = samp),
    aes(150, 1500, label = paste(
      "Adj R2 = ", round(adjr2, 2), "\n",
      "Slope =", round(slope, 2), "\n"
    )),
    size = 4
  ) +
  ggtitle("B")
---
title: "Transposon mutagenesis in *GAP1* CNV strains"
author: "Grace Avecilla"
output: html_notebook
---


```{r}
library(EnhancedVolcano)
library(DESeq2)
library(org.Sc.sgd.db)
library(clusterProfiler)
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggpubr)
# Default ggplot2 theme for every figure produced by this notebook.
theme_set(theme_bw(base_size = 20))

# Root of the project on the mounted Google Drive. Defined once so the data,
# figure and helper-script locations cannot drift apart.
project_dir <- "/Volumes/GoogleDrive/My Drive/Gresham Lab_Grace/france_satay/Hermes_mutagenesis_paper"

data_dir <- file.path(project_dir, "data")

fig_dir <- file.path(project_dir, "figures")

# NOTE(review): objects used later but not defined in this notebook
# (strain_names, all_genes_systematic, get_insert_profile, summary_rpp,
# ess_del) presumably come from this script - confirm.
source(file.path(project_dir, "analysis_w_PS_corrections", "functions.R"))

# Sequencing runs; each is a sub-directory of <data_dir>/hpc_output.
seq_runs <- c("bgi1", "bgi2", "nyc1", "nyc2")
```

```{r}
# Get all genes (excluding dubious ORFs, per the original note) and their copy
# number. Only `Gene` and the columns whose names contain "cor" are kept; they
# are reshaped to one row per gene/strain, the strain label is reduced to its
# first run of four digits, and the strain_names lookup is joined on.
strain_cns <- paste0(
  data_dir,
  "/gene_median_relative_depth_DNA_corrected_v3/gene_median_relative_depth_DNA-Table 1.csv"
) %>%
  read_csv() %>%
  select(Gene, contains("cor")) %>%
  pivot_longer(cols = -Gene, names_to = "strain", values_to = "copy_number") %>%
  mutate(strain = str_extract(strain, "[0-9]{4}")) %>%
  left_join(strain_names)

# Gene identifiers, one entry per row of strain_cns.
all_genes <- strain_cns[["Gene"]]
```

# Correlation between different sequencing methods
Using insertions per gene
```{r}
# Read one insertions-per-gene table of a sequencing run.
#
# x:   file name inside <data_dir>/hpc_output/<ver>/.
# ver: sequencing run (sub-directory name); stored in the `run` column.
#
# Returns the parsed table with three extra columns: `sample` (the file name
# without its last 21 characters), `gene` (always NA) and `run`.
# Reads the global `data_dir`.
get_ipg <- function(x, ver) {
  # The pipeline is the last expression, so the table is returned visibly
  # instead of through an assignment to an unused local.
  read_tsv(
    paste0(data_dir, "/hpc_output/", ver, "/", x),
    col_types = cols(CDS = col_character(), `#insertion` = col_double())
  ) %>%
    mutate(sample = str_sub(x, 1, -22), gene = NA, run = ver)
}
# Load every insertions-per-gene file of every sequencing run into one long
# table. `sample` and `run` are merged into `sample_rep` ("<sample>_<run>").
ipg_forcor <- map(seq_runs, function(seq_run) {
  # list.files() takes a regular expression, not a glob: escape the dot and
  # anchor the suffix at the end of the file name.
  ipg_files <- list.files(
    path = paste0(data_dir, "/hpc_output/", seq_run, "/"),
    pattern = "insertionPerGene\\.txt$"
  )
  if (length(ipg_files) == 0) {
    # Skip the run instead of failing later on a table without columns.
    warning("No insertionPerGene files found for run ", seq_run, call. = FALSE)
    return(NULL)
  }
  map(ipg_files, get_ipg, ver = seq_run) %>%
    bind_rows() %>%
    unite(sample_rep, sample, run)
}) %>%
  bind_rows()
```

```{r}
# get correlations
# Split `sample_rep` back into strain (everything but the last 5 characters)
# and run (the last 4 characters), swap the strain id for its display name from
# strain_names, and spread to one column of insertion counts per "<name>_<run>".
ipg_forcor <- ipg_forcor %>%
  mutate(
    strain = str_sub(sample_rep, 1, -6),
    rep = str_sub(sample_rep, -4, -1)
  ) %>%
  left_join(strain_names) %>%
  unite(sample_rep, strain_names, rep) %>%
  dplyr::select(CDS, sample_rep, `#insertion`) %>%
  pivot_wider(names_from = sample_rep, values_from = `#insertion`)

# Draw the pairwise scatter/Pearson-correlation matrix of the columns of `reps`
# and save it as <fig_dir>/SFigX<panel_letter>.pdf. Returns the plot invisibly.
save_cor_plot <- function(reps, panel_letter) {
  cor_plot <- GGally::ggpairs(
    reps,
    upper = list(continuous = GGally::wrap("cor", method = "pearson"))
  ) +
    ggtitle(panel_letter)
  # Name both arguments: ggsave()'s first parameter is the file name, not the plot.
  ggsave(
    filename = paste0(fig_dir, "/SFigX", panel_letter, ".pdf"),
    plot = cor_plot,
    height = 10,
    width = 10
  )
  invisible(cor_plot)
}

# Panel A: the euploid replicates from the two nyc runs.
save_cor_plot(
  ipg_forcor %>% dplyr::select(eu_1_nyc1, eu_1_nyc2, eu_2_nyc1, eu_2_nyc2),
  LETTERS[1]
)

# Panels B onwards: one panel per CNV strain, using every column whose name
# matches the strain label.
cnv_strain_labels <- c("aneu", "trip1", "trip2", "trip3", "trip4", "iso", "quad")
for (idx in seq_along(cnv_strain_labels)) {
  strain_cols <- str_subset(colnames(ipg_forcor), cnv_strain_labels[[idx]])
  save_cor_plot(
    ipg_forcor %>% dplyr::select(all_of(strain_cols)),
    LETTERS[idx + 1]
  )
}

```

```{r get reads per position}
# Read one header-less reads-per-position table.
#
# x:   file name inside <data_dir>/hpc_output/<ver>/.
# ver: sub-directory name; stored in the `version` column.
#
# Returns the table with its first three columns named `chromosome`, `chr_pos`
# and `reads`, plus `sample` (the file name without its last 15 characters)
# and `version`. Reads the global `data_dir`.
get_rpp <- function(x, ver) {
  filepath <- paste0(data_dir, "/hpc_output/", ver, "/", x)
  # FALSE is spelled out (F can be reassigned) and the pipeline is the last
  # expression so the table is returned visibly.
  read_tsv(filepath, col_names = FALSE, col_types = cols()) %>%
    dplyr::rename(chromosome = X1, chr_pos = X2, reads = X3) %>%
    mutate(sample = str_sub(x, 1, -16), version = ver)
}
# Read every reads-per-position file of the "combined" run into one table.
# list.files() takes a regular expression, not a glob: escape the dot and
# anchor the suffix at the end of the file name.
files <- list.files(
  path = paste0(data_dir, "/hpc_output/combined/"),
  pattern = "readPerPos\\.txt$"
)
read_per_pos <- map(files, get_rpp, ver = "combined")
rpp_df <- bind_rows(read_per_pos)

```


# Get insertion profiles
```{r get insertion profiles, message=FALSE}
# get insertion profiles for all genes
insert_profiles <- map(all_genes_systematic, get_insert_profile, data = rpp_df)
insert_profiles <- do.call(rbind, insert_profiles)

# add in zero categories
# Genes without any profile row are brought in through the right join, then
# every sample x gene x type combination is enumerated. The grid gets its own
# name rather than `t`, which would shadow base::t().
profile_grid <- insert_profiles %>%
  right_join(tibble(all_genes_systematic), by = c("gene" = "all_genes_systematic")) %>%
  mutate(across(c(sample, gene, type), factor)) %>%
  tidyr::expand(sample, gene, type)

# Combinations missing from the observed profiles get counts of 0.
# (A `filter(!is.na(sample))` step was left disabled here by the author.)
insert_profiles <- insert_profiles %>%
  right_join(profile_grid) %>%
  mutate(
    n_insertions = replace_na(n_insertions, 0),
    normalized_insertions = replace_na(normalized_insertions, 0)
  ) %>%
  distinct()

# write out so this doesn't need to be done again
write_csv(insert_profiles, paste0(data_dir, "/insertion_profiles.csv"))
```

```{r get insertion profile csv}
# Load the cached insertion profiles from <data_dir>/insertion_profiles.csv.
insert_profiles <- paste0(data_dir, "/insertion_profiles.csv") %>%
  read_csv()
```

```{r}
# Insertions in coding sequence per gene and sample, scaled per million sites:
# rows of type "cds" are summed within each sample/gene pair and divided by the
# sample's `total_sites` (from summary_rpp) expressed in millions.
inserts_per_mill_cds <- insert_profiles %>%
  left_join(strain_names, by = c("sample" = "strain")) %>%
  left_join(summary_rpp) %>%
  filter(type %in% c("cds")) %>%
  group_by(sample, gene) %>%
  mutate(inserts = sum(n_insertions)) %>%
  ungroup() %>%
  select(sample, strain_names, gene, inserts, total_sites) %>%
  distinct() %>%
  mutate(inserts_per_mill = inserts / (total_sites / 1e6))
```


# Look at genes in CNV region

```{r}
# Gene/strain rows of strain_cns with a copy number of at least 2; the `strain`
# column is renamed to `sample`.
cnv_genes <- rename(
  filter(strain_cns, copy_number >= 2),
  sample = strain
)
```


```{r Supplemental_Fig_S10A}
# Data shared by the test and the plot: genes whose ID starts with "YK",
# without the gene/strain pairs listed in cnv_genes, with genes missing from
# the ess_del table labelled "not essential".
# cnv_genes names its gene column `Gene`, so a bare anti_join(cnv_genes) would
# match on strain columns only and drop whole strains; the key is spelled out.
# Assumes `Gene` uses the same identifiers as `gene` - TODO confirm.
sfig4a_data <- inserts_per_mill_cds %>%
  left_join(ess_del) %>%
  filter(str_starts(gene, "YK")) %>%
  anti_join(
    cnv_genes %>% select(gene = Gene, strain_names),
    by = c("gene", "strain_names")
  ) %>%
  mutate(ess_del = if_else(is.na(ess_del), "not essential", ess_del))

# Per-sample t-test of normalized insertions between the two ess_del classes.
# The label ladder has no gaps: previously p in (0.0001, 0.01] became NA.
# The original cut-off for "ns" (p > 0.01) is kept.
sfigq_sig <- sfig4a_data %>%
  group_by(sample) %>%
  rstatix::t_test(inserts_per_mill ~ ess_del) %>%
  ungroup() %>%
  mutate(significance = case_when(
    p <= 0.0001 ~ "****",
    p <= 0.001 ~ "***",
    p <= 0.01 ~ "**",
    p > 0.01 ~ "ns"
  ))

# NOTE(review): the legend labels are applied in the sorted order of the
# ess_del values; confirm that order is non-essential first.
# NOTE(review): bracket positions are computed on `sample` while the x axis
# shows `strain_names`; confirm both order the strains identically.
sfig4a <- sfig4a_data %>%
  ggplot(aes(strain_names, inserts_per_mill, color = ess_del)) +
  ylab("Normalized insertions") +
  xlab("") +
  # `cex` is not a geom_point parameter and was ignored, so it is dropped.
  geom_point(alpha = 0.7, size = 1, position = position_jitterdodge()) +
  geom_boxplot(outlier.shape = NA, alpha = 0, lwd = 1) +
  scale_color_manual(
    values = c("#80b1d3", "#fb8072"),
    name = "",
    labels = c("Non-essential", "Essential")
  ) +
  stat_pvalue_manual(
    sfigq_sig %>% rstatix::add_xy_position(x = "sample"),
    y.position = 1500,
    label = "significance"
  ) +
  theme(
    legend.background = element_rect(fill = "transparent"),
    legend.position = c(.85, .73)
  )

sfig4a
```

```{r Supplemental_Fig_S10B}
# non-amplified genes
# One row per gene and CNV strain, holding the strain's normalized insertions
# next to the mean of the two euploid samples (eu_1, eu_2). Gene/strain pairs
# listed in cnv_genes are removed first.
# cnv_genes names its gene column `Gene`, so it is renamed for the join.
# Assumes `Gene` uses the same identifiers as `gene` - TODO confirm.
tempo <- inserts_per_mill_cds %>%
  select(strain_names, inserts_per_mill, gene) %>%
  anti_join(
    cnv_genes %>% select(gene = Gene, strain_names),
    by = c("gene", "strain_names")
  ) %>%
  pivot_wider(names_from = "strain_names", values_from = "inserts_per_mill") %>%
  mutate(mean1657 = (eu_1 + eu_2) / 2) %>%
  pivot_longer(
    cols = c(aneu, trip1, trip2, trip3, trip4, iso, quad),
    names_to = "strain",
    values_to = "inserts_per_mill"
  )

# Regress one CNV strain's normalized insertions on the euploid mean and return
# a one-row tibble of fit statistics. `n_greater2sigma` counts residuals whose
# absolute value exceeds the mean residual plus two residual standard errors.
fit_strain_regression <- function(samp) {
  tx <- tempo %>%
    filter(strain == samp, !is.na(inserts_per_mill))
  fit <- lm(inserts_per_mill ~ mean1657, data = tx)
  fit_summary <- summary(fit)
  coef_pvals <- fit_summary$coefficients[, 4]
  slope_ci <- confint(fit, 2, level = 0.95)
  resid_sd <- sigma(fit)
  fit_resid <- residuals(fit)
  tibble(
    samp = samp,
    intercept = unname(coef(fit)[1]),
    intercept_pval = unname(coef_pvals[1]),
    slope = unname(coef(fit)[2]),
    slope_pval = unname(coef_pvals[2]),
    slope_ci2.5 = slope_ci[1],
    slope_ci97.5 = slope_ci[2],
    adjr2 = fit_summary$adj.r.squared,
    resid_sd = resid_sd,
    n_greater2sigma = sum(abs(fit_resid) > mean(fit_resid) + resid_sd * 2)
  )
}

amp_regress <- map(unique(tempo$strain), fit_strain_regression) %>%
  bind_rows()

sfig4b <- tempo %>%
  #left_join(strain_names) %>%
  #filter(strain != "1657_1", strain != "1657_2") %>%
  #filter(!is.na(inserts_per_mill), mean1657 < 750) %>%
  ggplot(aes(mean1657, inserts_per_mill)) +
  #ggiraph::geom_point_interactive(aes(data_id = common_name, tooltip = common_name), alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  geom_point(alpha = 0.5) +
  facet_wrap(~strain) +
  theme_bw(base_size = 16) +
  theme(legend.position = "none") +
  ylab("CNV strain normalized insertions") +
  xlab("Euploid normalized insertions") +
  # Per-facet annotation with the fit statistics computed above.
  geom_text(
    data = amp_regress %>% rename(strain = samp),
    aes(150, 1500, label = paste(
      "Adj R2 = ", round(adjr2, 2), "\n",
      "Slope =", round(slope, 2), "\n"
    )),
    size = 4
  ) +
  ggtitle("B")
```

