# ExN50 estimate (earlier run of the chunk that is repeated further down).
# `data`, `tpm_x` and `threshold_entry` are not created in this part of the
# file; they must already exist in the session.

# Keep transcripts whose max expression across samples is >= tpm_x, longest first.
filtered_data = data %>% filter(max_expr_over_samples >= tpm_x) %>% arrange(desc(length))
# Total length of the retained transcripts.
sum_length = sum(filtered_data$length)
# Running total of length down the length-sorted table.
filtered_data = filtered_data %>% mutate(cumsum_len = cumsum(length))
half_length = sum_length / 2
# Last row whose running total is still <= half of the total length.
# NOTE(review): N50 is conventionally the first length at which the running
# total reaches half; this picks the row before that crossing and yields zero
# rows when the longest transcript alone exceeds half_length -- confirm intent.
N50_entry = filtered_data %>% filter(cumsum_len <= half_length) %>% filter(row_number() == n())
# Ex value of the chosen threshold row, and the transcript length used as N50.
Ex = threshold_entry %>% pull(X.Ex)
N50 = N50_entry %>% pull(length)
# Number of transcripts passing the expression threshold.
num_transcripts =  data %>% filter(max_expr_over_samples >= tpm_x) %>% nrow()
# Scatter of logexpr vs r with the threshold marked in red and a text label
# summarising the threshold statistics.
# NOTE(review): the label is placed at y = num_transcripts while the y axis
# is `r` -- presumably these share a scale; verify.
p = data %>% ggplot(aes(x=logexpr, y=r)) + geom_point() +
geom_vline(xintercept=threshold_entry$logexpr, color='red') +
annotate("text", x=threshold_entry$logexpr, y=num_transcripts,
label=paste0("    # transcripts =", num_transcripts, " Ex=", Ex, " N50=", N50, " TPMt=", tpm_x), hjust=0)
p
# Estimate ExN50 for the transcripts that pass the expression threshold.
# `data`, `tpm_x` and `threshold_entry` must already exist in the session.

# Transcripts with max expression >= tpm_x, longest first, with a running
# total of their lengths.
filtered_data = data %>% filter(max_expr_over_samples >= tpm_x) %>% arrange(desc(length))
sum_length = sum(filtered_data$length)
filtered_data = filtered_data %>% mutate(cumsum_len = cumsum(length))
half_length = sum_length / 2

# N50 is the length of the first transcript (in descending length order) at
# which the running total reaches half of the summed length. Selecting on
# `>= half_length` and taking the first row also covers the case where the
# longest transcript alone accounts for more than half of the total.
N50_entry = filtered_data %>% filter(cumsum_len >= half_length) %>% filter(row_number() == 1)
Ex = threshold_entry %>% pull(X.Ex)
N50 = N50_entry %>% pull(length)

# Number of transcripts passing the expression threshold.
num_transcripts = nrow(filtered_data)

# Scatter of logexpr vs r with the threshold marked in red and a text label
# summarising the threshold statistics.
p = data %>% ggplot(aes(x=logexpr, y=r)) + geom_point() +
  geom_vline(xintercept=threshold_entry$logexpr, color='red') +
  annotate("text", x=threshold_entry$logexpr, y=num_transcripts,
           label=paste0("    # transcripts =", num_transcripts, " Ex=", Ex, " N50=", N50, " TPMthresh=", tpm_x), hjust=0)
p

# Quick checks of the threshold on the log2(x + 1) scale and of the chosen row.
log2(0.69+1)
threshold_entry
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
# Load the coverage bedgraph (no header row) and name its four columns.
data = read.table(gzfile("M0834-NP-WGS-Montagna-SCC47.bedgraph.gz"), header=FALSE, sep="\t")
colnames(data) = c('chrom', 'start', 'end', 'cov')

# Raw coverage along chr3.
chr3_data = data %>% filter(chrom == "chr3")
chr3_data %>% ggplot(aes(x=start, y=cov)) + geom_line()

# Kernel density of log2(cov + 1), with a look at the density object.
data = data %>% mutate(log2cov = log2(cov+1))
density_log2cov = density(data$log2cov)
density_log2cov
str(density_log2cov)

# Locate the most common coverage as the x position of the density maximum.
# (base::mode() reports the storage mode of an object, not a statistical
# mode, so it cannot be used for this.)
log2x_peak = density_log2cov$x[which.max(density_log2cov$y)]
# Undo the log2(cov + 1) transform to get the peak on the raw coverage scale.
x_peak = 2^log2x_peak -1
plot(density_log2cov)
abline(v=log2x_peak, col='red')
x_peak

# Scale coverage so that coverage equal to the peak maps to 2 copies.
data = data %>% mutate(copies = cov / (x_peak / 2) )

# chr3_data has to be rebuilt after `copies` is added; the subset taken above
# predates that column.
chr3_data = data %>% filter(chrom == "chr3")
chr3_data %>% ggplot(aes(x=start, y=copies)) + geom_line()
# TP63 at: chr3:189,789,659-189,897,276
TP63_pos = 189789659
TP63_start_pos = 189789659
TP63_end_pos = 189897276

# Copy-number profile across all of chr3 with the TP63 start marked.
chr3_data %>% ggplot(aes(x=start, y=copies)) + geom_line() + geom_vline(xintercept=TP63_pos, color='orange')

# Zoom in on the TP63 start with progressively narrower windows
# (+/- WINDOWSIZE bp). Both bounds are centred on TP63_pos.
# print() is required for ggplot objects to be drawn inside a loop.
for (WINDOWSIZE in c(10000000, 1000000, 100000)) {
  print(
    chr3_data %>% filter(start > TP63_pos - WINDOWSIZE & start < TP63_pos + WINDOWSIZE) %>%
      ggplot(aes(x=start, y=copies)) + geom_line() + geom_vline(xintercept=TP63_pos, color='orange')
  )
}

# Whole chr3 again, now with both TP63 boundaries marked.
chr3_data %>% ggplot(aes(x=start, y=copies)) + geom_line() + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange')

# The remaining zoomed views use a +/- 1 Mb window around the gene start.
WINDOWSIZE=1000000

# Zoomed view with both gene boundaries.
chr3_data %>% filter(start > TP63_start_pos - WINDOWSIZE & start < TP63_start_pos + WINDOWSIZE) %>%
  ggplot(aes(x=start, y=copies)) + geom_line()  + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange')

# Same view with a reference line at 2 copies.
chr3_data %>% filter(start > TP63_start_pos - WINDOWSIZE & start < TP63_start_pos + WINDOWSIZE) %>%
  ggplot(aes(x=start, y=copies)) + geom_line()  + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange') + geom_hline(yintercept=2, color='purple')

# Same view with the y axis limited to 0-10 copies.
chr3_data %>% filter(start > TP63_start_pos - WINDOWSIZE & start < TP63_start_pos + WINDOWSIZE) %>%
  ggplot(aes(x=start, y=copies)) + geom_line()  + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange') + geom_hline(yintercept=2, color='purple') + ylim(0,10)
# coverage density w/o log
# The rows are filtered on the data frame and the column is pulled afterwards;
# dplyr::filter() cannot be applied to the bare `data$cov` vector.
# Progressively tighter upper cutoffs are used to bring the main peak into view.
density_raw_cov = density(data %>% filter(cov < 10000) %>% pull(cov))
plot(density_raw_cov)
density_raw_cov = density(data %>% filter(cov < 1000) %>% pull(cov))
plot(density_raw_cov)
density_raw_cov = density(data %>% filter(cov < 400) %>% pull(cov))
plot(density_raw_cov)
density_raw_cov = density(data %>% filter(cov < 100) %>% pull(cov))
plot(density_raw_cov)

# Coverage at the density maximum (raw scale), marked on the last density plot.
x_peak = density_raw_cov$x[which.max(density_raw_cov$y)]
abline(v=x_peak, col='red')

# Rescale so that coverage equal to the peak maps to 2 copies.
data = data %>% mutate(copies = cov / (x_peak / 2) )

# All plots below subset `data` directly so they use the freshly computed
# `copies`; a chr3 subset taken earlier would still hold the previous values.
WINDOWSIZE=1000000
data %>% filter(chrom=='chr3') %>%
  filter(start > TP63_start_pos - WINDOWSIZE & start < TP63_start_pos + WINDOWSIZE) %>%
  ggplot(aes(x=start, y=copies)) + geom_line()  + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange') + geom_hline(yintercept=2, color='purple')

# Whole chr3: plain, with the 2-copy reference line, and limited to 0-10 copies.
data %>% filter(chrom=='chr3') %>% ggplot(aes(x=start, y=copies)) + geom_line() + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange')
data %>% filter(chrom=='chr3') %>% ggplot(aes(x=start, y=copies)) + geom_line() + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange') + geom_hline(yintercept=2, color='purple')
data %>% filter(chrom=='chr3') %>% ggplot(aes(x=start, y=copies)) + geom_line() + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange') + geom_hline(yintercept=2, color='purple') + ylim(0,10)

# chr1 for comparison.
data %>% filter(chrom=='chr1') %>% ggplot(aes(x=start, y=copies)) + geom_line() + geom_hline(yintercept=2, color='purple') + ylim(0,10)
# Write one copy-number plot per chromosome to chrom_cov_plots.pdf.
# Chromosome names containing "_" are skipped.
chrs = data %>% select(chrom) %>% unique() %>% filter(! grepl("_", chrom)) %>% pull('chrom')

# Open the PDF device exactly once and close it after the loop so that no
# graphics device is left open.
pdf("chrom_cov_plots.pdf")
for (chr in chrs) {
  p = data %>% filter(chrom==chr) %>% ggplot(aes(x=start, y=copies)) + geom_line() + geom_hline(yintercept=2, color='purple') + ylim(0,10) +
    ggtitle(chr)
  # ggplot objects are not auto-printed inside a loop.
  plot(p)
}
dev.off()
# Reload the coverage bedgraph and smooth the coverage track with stats::smooth()
# before estimating copy number.
data = read.table(gzfile("M0834-NP-WGS-Montagna-SCC47.bedgraph.gz"), header=FALSE, sep="\t")
colnames(data) = c('chrom', 'start', 'end', 'cov')
data$cov = smooth(data$cov)

# Density of smoothed coverage below 100; the x position of its maximum is
# taken as the coverage peak.
density_raw_cov = density(data %>% filter(cov < 100) %>% pull(cov))
plot(density_raw_cov)
x_peak = density_raw_cov$x[which.max(density_raw_cov$y)]
abline(v=x_peak, col='red')

# Histograms of the coverage at increasing resolution.
hist(data$cov)
hist(data$cov[data$cov <= 100])
hist(data$cov[data$cov <= 200])
hist(data$cov[data$cov <= 200], breaks = 100)
hist(data$cov[data$cov <= 200], breaks = 1000)
# Mark the peak (red) and multiples of half the peak: 1/2 and 3/2 (orange),
# 2 and 5/2 (purple).
abline(v=x_peak, col='red')
abline(v=x_peak/2, col='orange')
abline(v=x_peak*3/2, col='orange')
abline(v=x_peak*2, col='purple')
abline(v=x_peak*5/2, col='purple')

# Copy number must be computed on the reloaded table before any plot uses it.
# The parentheses matter: cov/(x_peak/2) maps peak coverage to 2 copies,
# whereas cov/x_peak/2 evaluates left to right and maps it to 0.5.
data = data %>% mutate(copies = cov/(x_peak/2))

WINDOWSIZE=1000000
# chr3:189,789,659-189,897,276
TP63_start_pos = 189789659
TP63_end_pos=189897276

# +/- WINDOWSIZE around the TP63 start, with gene boundaries (orange) and a
# 2-copy reference line (purple). `data` is subset directly so that the plots
# use the `copies` values computed above.
data %>% filter(chrom=='chr3') %>%
  filter(start > TP63_start_pos - WINDOWSIZE & start < TP63_start_pos + WINDOWSIZE) %>%
  ggplot(aes(x=start, y=copies)) + geom_line()  + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange') + geom_hline(yintercept=2, color='purple')

# Same window with the y axis limited to 0-10 copies.
data %>% filter(chrom=='chr3') %>% filter(start > TP63_start_pos - WINDOWSIZE & start < TP63_start_pos + WINDOWSIZE) %>%
  ggplot(aes(x=start, y=copies)) + geom_line()  + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange') + geom_hline(yintercept=2, color='purple') + ylim(0,10)

# Whole chr3, and chr1 for comparison.
data %>% filter(chrom=='chr3') %>% ggplot(aes(x=start, y=copies)) + geom_line() + geom_vline(xintercept=TP63_start_pos, color='orange') + geom_vline(xintercept = TP63_end_pos, color='orange') + geom_hline(yintercept=2, color='purple') + ylim(0,10)
data %>% filter(chrom=='chr1') %>% ggplot(aes(x=start, y=copies)) + geom_line() + geom_hline(yintercept=2, color='purple') + ylim(0,10)

# One page per chromosome (names containing "_" are skipped).
chrs = data %>% select(chrom) %>% unique() %>% filter(! grepl("_", chrom)) %>% pull('chrom')
pdf("chrom_cov_plots.pdf")
for (chr in chrs) {
  p = data %>% filter(chrom==chr) %>% ggplot(aes(x=start, y=copies)) + geom_line() + geom_hline(yintercept=2, color='purple') + ylim(0,10) +
    ggtitle(chr)
  plot(p)
}
dev.off()
?smooth
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
# Reload the coverage table (no header row) and name its four columns.
data <- read.table(
  gzfile("M0834-NP-WGS-Montagna-SCC47.bedgraph.gz"),
  header = FALSE,
  sep = "\t"
)
colnames(data) <- c("chrom", "start", "end", "cov")

# Smooth the coverage track with stats::smooth(), this time with twiceit enabled.
data$cov <- smooth(data$cov, twiceit = TRUE)

# Density of the smoothed coverage below 100; the x position of the density
# maximum is taken as the coverage peak.
density_raw_cov <- density(data %>% filter(cov < 100) %>% pull(cov))
plot(density_raw_cov)
x_peak <- with(density_raw_cov, x[which.max(y)])
abline(v = x_peak, col = "red")

# Histogram of coverage up to 200 with the peak (red) and multiples of half
# the peak marked: 1/2 and 3/2 in orange, 2 and 5/2 in purple.
hist(data$cov[data$cov <= 200], breaks = 1000)
abline(v = x_peak, col = "red")
abline(v = c(x_peak/2, x_peak*3/2), col = "orange")
abline(v = c(x_peak*2, x_peak*5/2), col = "purple")

# Coverage equal to the peak corresponds to 2 copies.
data <- mutate(data, copies = cov/(x_peak/2))

WINDOWSIZE <- 1000000
# chr3:189,789,659-189,897,276
TP63_start_pos <- 189789659
TP63_end_pos <- 189897276

# +/- WINDOWSIZE around the TP63 start: gene boundaries in orange, 2-copy
# reference in purple.
ggplot(
  filter(
    data,
    chrom == "chr3",
    start > TP63_start_pos - WINDOWSIZE,
    start < TP63_start_pos + WINDOWSIZE
  ),
  aes(x = start, y = copies)
) +
  geom_line() +
  geom_vline(xintercept = TP63_start_pos, color = "orange") +
  geom_vline(xintercept = TP63_end_pos, color = "orange") +
  geom_hline(yintercept = 2, color = "purple")

# Same window with the y axis limited to 0-10 copies.
ggplot(
  filter(
    data,
    chrom == "chr3",
    start > TP63_start_pos - WINDOWSIZE,
    start < TP63_start_pos + WINDOWSIZE
  ),
  aes(x = start, y = copies)
) +
  geom_line() +
  geom_vline(xintercept = TP63_start_pos, color = "orange") +
  geom_vline(xintercept = TP63_end_pos, color = "orange") +
  geom_hline(yintercept = 2, color = "purple") +
  ylim(0, 10)

# Whole chr3.
ggplot(filter(data, chrom == "chr3"), aes(x = start, y = copies)) +
  geom_line() +
  geom_vline(xintercept = TP63_start_pos, color = "orange") +
  geom_vline(xintercept = TP63_end_pos, color = "orange") +
  geom_hline(yintercept = 2, color = "purple") +
  ylim(0, 10)

# chr1 for comparison.
ggplot(filter(data, chrom == "chr1"), aes(x = start, y = copies)) +
  geom_line() +
  geom_hline(yintercept = 2, color = "purple") +
  ylim(0, 10)

# One PDF page per chromosome; names containing "_" are skipped.
chrs <- data %>%
  select(chrom) %>%
  unique() %>%
  filter(!grepl("_", chrom)) %>%
  pull("chrom")
pdf("chrom_cov_plots.pdf")
for (chr in chrs) {
  p <- ggplot(filter(data, chrom == chr), aes(x = start, y = copies)) +
    geom_line() +
    geom_hline(yintercept = 2, color = "purple") +
    ylim(0, 10) +
    ggtitle(chr)
  # Explicit print: ggplot objects are not drawn automatically inside a loop.
  print(p)
}
dev.off()
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(cowplot)
# Window-size tag embedded in the hotspot file names.
HOTSPOT_WINSIZE <- "1e5"

# Per-hotspot / per-virus tallies.
# ex. plot.hotspots.win_1e5.tsv.w_20_neighbors.regrouped_by_insert_gene.hotspot_virus_sample_counts.tsv
hotspot_info <- read.table(
  paste0("plot.hotspots.win_", HOTSPOT_WINSIZE, ".tsv.w_20_neighbors.regrouped_by_insert_gene.hotspot_virus_sample_counts.tsv"),
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
head(hotspot_info)

# Individual insertions with their hotspot assignment.
# ex. hotspots.win_1e5.tsv.w_20_neighbors.regrouped_by_insert_gene
hotspot_insertions <- read.table(
  paste0("hotspots.win_", HOTSPOT_WINSIZE, ".tsv.w_20_neighbors.regrouped_by_insert_gene"),
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
# use the regrouped hotspot name as the hotspot name:
hotspot_insertions <- mutate(hotspot_insertions, hotspot = gene_regrouped_hotspot_name)
head(hotspot_insertions)

#insertion_hotspot_assignments = read.table(paste0("hotspots.win_", HOTSPOT_WINSIZE, ".tsv"), header=T, sep="\t", stringsAsFactors = F)
#head(insertion_hotspot_assignments)

# Number of distinct hotspots.
nrow(unique(select(hotspot_info, hotspot)))

# Sum of the per-virus tallies for each hotspot, largest first.
hotspot_sum_sample_virus_counts <- hotspot_info %>%
  select(hotspot, virus_genome, hotspot_virus_tally) %>%
  group_by(hotspot) %>%
  summarize(sum_sample_viruses = sum(hotspot_virus_tally)) %>%
  arrange(desc(sum_sample_viruses)) %>%
  ungroup()
hotspot_sum_sample_virus_counts

# Ranked bar chart of hotspot sizes on a log10 y axis.
ggplot(
  hotspot_sum_sample_virus_counts,
  aes(x=reorder(hotspot, -1*sum_sample_viruses), sum_sample_viruses)
) +
  geom_col() +
  ggtitle("sum(virus,sample) ranked hotspots") +
  scale_y_continuous(trans='log10')

# How many hotspots exist at each size.
hotspot_sum_sample_virus_counts %>% count(sum_sample_viruses)

# The same distribution as a bar chart, on a linear y axis ...
hotspot_sum_sample_virus_counts %>%
  count(sum_sample_viruses) %>%
  ggplot(aes(x=as.factor(sum_sample_viruses), y=n)) +
  geom_col() +
  ggtitle("Number of hotspots with number of sample/viruses at sites") +
  xlab("number of sample/viruses at site") +
  ylab("number of hotspots")

# ... and on a log10 y axis, kept in `p` so further layers can be added.
p <- hotspot_sum_sample_virus_counts %>%
  count(sum_sample_viruses) %>%
  ggplot(aes(x=as.factor(sum_sample_viruses), y=n)) +
  geom_col() +
  ggtitle("Number of hotspots with number of sample/viruses at sites") +
  xlab("number of sample/viruses at site") +
  ylab("number of hotspots") +
  scale_y_continuous(trans='log10')
p

# include random data:
# cluster-size counts from random trials, restricted to sizes >= 3 and
# overlaid as red boxplots.
random_hotspot_data <- read.table(
  gzfile(paste0("data/rand_cluster_size_counts.win_",  HOTSPOT_WINSIZE, ".tsv.gz")),
  header = FALSE,
  sep = "\t"
)
colnames(random_hotspot_data) <- c("trial", "hotspot_size", "num_hotspots")
random_hotspot_data <- filter(random_hotspot_data, hotspot_size >= 3)
p <- p +
  geom_boxplot(
    data = random_hotspot_data,
    aes(x=as.factor(hotspot_size), y=num_hotspots),
    color = "red"
  )
p
# Total insertions per hotspot, summed over viruses, attached to every
# hotspot_info row.
top_hotspots = hotspot_info %>% group_by(hotspot) %>% summarize(sum_insertions = sum(hotspot_virus_tally)) %>% ungroup()
top_hotspots %>% arrange(desc(sum_insertions))
hotspot_info =  left_join(hotspot_info, top_hotspots, by='hotspot')

# Stacked bars of per-virus tallies for hotspots with >= 10 insertions,
# ordered by hotspot size.
hotspot_info %>% filter(sum_insertions >= 10) %>%
  ggplot(aes(x=reorder(hotspot, -1*sum_insertions), y=hotspot_virus_tally, fill=virus_genome)) + geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Same plot with every virus name containing "HPV" collapsed to a single "HPV" label.
hotspot_info %>% filter(sum_insertions >= 10) %>%
  mutate(virus_genome = ifelse(grepl("HPV", virus_genome), "HPV", virus_genome)) %>%
  ggplot(aes(x=reorder(hotspot, -1*sum_insertions), y=hotspot_virus_tally, fill=virus_genome)) + geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Restrict to the largest hotspots and fix their plotting order (largest
# first) through factor levels so that every panel shares the same x order.
MIN_HOTSPOT_SIZE = 11
hotspot_info_top = hotspot_info %>% filter(sum_insertions >= MIN_HOTSPOT_SIZE)
ordered_hotspots = hotspot_info_top %>% arrange(desc(sum_insertions)) %>% select(hotspot) %>% unique() %>% pull(hotspot)
hotspot_info_top$hotspot = factor(hotspot_info_top$hotspot, levels=ordered_hotspots)

hotspot_info_top %>%
  #mutate(virus_genome = ifelse(grepl("HPV", virus_genome), "HPV", virus_genome)) %>%
  ggplot(aes(x=hotspot, y=hotspot_virus_tally, fill=virus_genome)) + geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

hotspot_by_virus_plot = hotspot_info_top %>%
  mutate(virus_genome = ifelse(grepl("HPV", virus_genome), "HPV", virus_genome)) %>%
  ggplot(aes(x=hotspot, y=hotspot_virus_tally, fill=virus_genome)) + geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
hotspot_by_virus_plot

hotspots_of_interest = hotspot_info_top %>% select(hotspot) %>% unique() %>% pull(hotspot)

# Sequencing-type composition per hotspot.
# hotspot_info_top holds one row per (hotspot, virus), so the left side of
# each join below is reduced to one row per hotspot first; otherwise every
# insertion would be counted once per virus and `n` would be inflated.
hotspot_insertion_seqtypes = left_join(hotspot_info_top %>% select(hotspot) %>% unique(),
                                       hotspot_insertions %>% filter(gene_regrouped_hotspot_name %in% hotspots_of_interest) %>%
                                         select(hotspot = gene_regrouped_hotspot_name, seqtype),
                                       by=c('hotspot') )
# tally() leaves the result grouped by hotspot, so prop.table(n) yields the
# within-hotspot fraction; ungroup before the grouping column is reassigned.
hotspot_seqtype_frac_info = hotspot_insertion_seqtypes %>% group_by(hotspot, seqtype) %>% tally() %>% mutate(pct=prop.table(n)) %>% ungroup()
hotspot_seqtype_frac_info
hotspot_seqtype_frac_info$hotspot = factor(hotspot_seqtype_frac_info$hotspot, levels=ordered_hotspots)
hotspot_by_seqtype_plot = hotspot_seqtype_frac_info %>% ggplot(aes(x=hotspot, y=pct, fill=seqtype)) + geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
hotspot_by_seqtype_plot

#  cohort representation
hotspot_insertion_cohorts = left_join(hotspot_info_top %>% select(hotspot) %>% unique(),
                                      hotspot_insertions %>% filter(gene_regrouped_hotspot_name %in% hotspots_of_interest) %>%
                                        select(hotspot = gene_regrouped_hotspot_name, cohort),
                                      by='hotspot')
hotspot_cohort_frac_info = hotspot_insertion_cohorts %>% group_by(hotspot, cohort) %>% tally() %>% mutate(pct=prop.table(n)) %>% ungroup()
hotspot_cohort_frac_info
hotspot_cohort_frac_info$hotspot = factor(hotspot_cohort_frac_info$hotspot, levels=ordered_hotspots)
hotspot_cohort_frac_info %>% ggplot(aes(x=hotspot, y=pct, fill=cohort)) + geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

#  project representation: TCGA rows are labelled "TCGA^<project>", all other
#  rows keep their cohort name.
hotspot_insertions = hotspot_insertions  %>% mutate(proj_name = ifelse(cohort=='TCGA', paste(cohort, project, sep="^"), cohort))
hotspot_insertion_projects = left_join(hotspot_info_top %>% select(hotspot) %>% unique(),
                                       hotspot_insertions %>% filter(gene_regrouped_hotspot_name %in% hotspots_of_interest) %>%
                                         select(hotspot=gene_regrouped_hotspot_name, proj_name),
                                       by='hotspot')
hotspot_project_frac_info = hotspot_insertion_projects %>% group_by(hotspot, proj_name) %>% tally() %>% mutate(pct=prop.table(n)) %>% ungroup()
hotspot_project_frac_info
hotspot_project_frac_info$hotspot = factor(hotspot_project_frac_info$hotspot, levels=ordered_hotspots)
hotspot_by_project_plot = hotspot_project_frac_info %>% ggplot(aes(x=hotspot, y=pct, fill=proj_name)) + geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
hotspot_by_project_plot

# Stack the three panels; only the bottom panel keeps its x axis labels.
pg = plot_grid(
  hotspot_by_seqtype_plot + theme(axis.title.x=element_blank(),axis.text.x=element_blank(), axis.ticks.x=element_blank()),
  hotspot_by_project_plot + theme(axis.title.x=element_blank(),axis.text.x=element_blank(), axis.ticks.x=element_blank()),
  hotspot_by_virus_plot,
  ncol = 1, align='v',
  rel_heights=c(0.15, 0.15, 0.5))
pg
# Arguments are spelled out instead of relying on `file` partially matching
# `filename` with the plot passed positionally.
ggsave(filename="top_hotspots_by_attributes.pdf", plot=pg, width=12, height=15)
# Attach the individual insertions (sample_id, contig) to each top hotspot.
# The left side is reduced to one row per hotspot before joining.
hotspot_info_top_with_insertions = left_join(hotspot_info_top %>% select(hotspot) %>% unique(),
hotspot_insertions %>% select(sample_id, contig, hotspot),
by='hotspot'
)
nrow(hotspot_info_top_with_insertions)
# add insertion cnv info
# Duplicate rows in the CNV table are dropped on load; the join matches
# sample_id to `sample` and contig to `insertion`.
cnv_info = read.table("../Insertion_and_CNVs/TCGA_insertions_and_CNV_within10kb", header=T, sep="\t", stringsAsFactors = F)  %>% unique()
hotspot_info_top_with_insertions_and_CNV = left_join(hotspot_info_top_with_insertions,
cnv_info,
by=c('sample_id'='sample', 'contig'='insertion') )
nrow(hotspot_info_top_with_insertions_and_CNV)
# choose peak cnv copy number where multiple insertions analyzed for a single sample
# Rows are sorted by descending copy_number and the first row of each
# (hotspot, sample_id) group is kept.
hotspot_info_top_with_insertions_and_CNV = hotspot_info_top_with_insertions_and_CNV  %>% group_by(hotspot, sample_id) %>% arrange(desc(copy_number)) %>% filter(row_number()==1) %>% ungroup()
nrow(hotspot_info_top_with_insertions_and_CNV)
# Re-apply the shared hotspot ordering so this panel lines up with the others.
hotspot_info_top_with_insertions_and_CNV$hotspot = factor(hotspot_info_top_with_insertions_and_CNV$hotspot, levels=ordered_hotspots)
# Copy-number boxplot per hotspot with a dashed reference line at 2 copies.
hotspot_cnv_boxplot = hotspot_info_top_with_insertions_and_CNV %>% ggplot(aes(x=hotspot, y=copy_number)) + geom_boxplot() +
geom_hline(yintercept=2, linetype=2, color='purple') +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
hotspot_cnv_boxplot
# Expression-region analysis per insertion; duplicate rows are dropped on load.
insertion_expr_info = read.table("../Insertion_and_EXPR_effects/virus_insertions_unfiltered.10kb.expression_region_analysis.tsv",
header=T, sep="\t", stringsAsFactors = F) %>% unique()
nrow(insertion_expr_info)
# Join the expression quantile by (sample_id, contig).
# NOTE(review): the left table was already reduced to one insertion per
# (hotspot, sample_id) in the CNV step, so only that insertion's contig can
# pick up an expr_quantile here -- confirm this is intended.
hotspot_info_top_with_insertions_and_CNV_and_EXPR = left_join(hotspot_info_top_with_insertions_and_CNV,
insertion_expr_info %>% select(sample_id, contig, expr_quantile),
by=c('sample_id', 'contig'))
nrow(hotspot_info_top_with_insertions_and_CNV_and_EXPR)
# take single highest expr quantile for sample,hotspot combo
hotspot_info_top_with_insertions_and_CNV_and_EXPR = hotspot_info_top_with_insertions_and_CNV_and_EXPR %>%
group_by(hotspot, sample_id) %>% arrange(desc(expr_quantile)) %>% filter(row_number()==1) %>% ungroup()
nrow(hotspot_info_top_with_insertions_and_CNV_and_EXPR)
# Re-apply the shared hotspot ordering.
hotspot_info_top_with_insertions_and_CNV_and_EXPR$hotspot = factor(hotspot_info_top_with_insertions_and_CNV_and_EXPR$hotspot, levels=ordered_hotspots)
# Expression-quantile boxplot per hotspot.
hotspot_expr_quantile_plot = hotspot_info_top_with_insertions_and_CNV_and_EXPR %>% ggplot(aes(x=hotspot, y=expr_quantile)) + geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
hotspot_expr_quantile_plot
# Samples that appear in more than one top hotspot ...
sample_ids_mult_hotspots = hotspot_info_top_with_insertions_and_CNV_and_EXPR %>% select(hotspot, sample_id) %>%  unique() %>%
group_by(sample_id) %>% tally() %>% filter(n>1) %>% ungroup()
sample_ids_mult_hotspots
# ... and their copy number / expression quantile at each of those hotspots.
hotspot_info_top_with_insertions_and_CNV_and_EXPR %>% filter(sample_id %in% sample_ids_mult_hotspots$sample_id) %>%
select(sample_id, hotspot, copy_number, expr_quantile) %>% arrange(sample_id)
# Bring all the plots together.
# Five vertically aligned panels; only the bottom panel (virus composition)
# keeps its x axis title, labels and ticks.
pg2 <- local({
  hide_x_axis <- theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
  plot_grid(
    hotspot_by_seqtype_plot + hide_x_axis,
    hotspot_by_project_plot + hide_x_axis,
    hotspot_cnv_boxplot + hide_x_axis,
    hotspot_expr_quantile_plot + hide_x_axis,
    hotspot_by_virus_plot,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(0.15, 0.15, 0.1, 0.1, 0.5)
  )
})
pg2
ggsave(filename = "hotspot_summary_plot.pdf", plot = pg2, width = 20, height = 15)
# One row per hotspot with its sample count and comma-separated sample list.
ordered_hotspot_insertions = hotspot_insertions %>%
select(hotspot, hotspot_sample_counts_redef, hotspot_sample_counts_redef_list) %>% unique()
head(ordered_hotspot_insertions)
# (re-run of the selection above before the derived columns are added)
ordered_hotspot_insertions = hotspot_insertions %>%
select(hotspot, hotspot_sample_counts_redef, hotspot_sample_counts_redef_list) %>% unique()
# Chromosome = text before the first ":" in the hotspot name.
# NOTE(review): presumably hotspot names look like "chr:coord^..." -- confirm
# against the input file.
ordered_hotspot_insertions$hotspot_chromosome = sapply(ordered_hotspot_insertions$hotspot, function(x) {
str_split(x, ":")[[1]][1]
} )
# Coordinate = second field when splitting the name on ":" or "^", as a number.
ordered_hotspot_insertions$hotspot_coord = sapply(ordered_hotspot_insertions$hotspot, function(x) {
as.numeric(str_split(x, ":|\\^")[[1]][2])
} )
# Sort so that neighbouring rows are neighbouring hotspots on a chromosome.
# hotspot_chromosome is built from strings, so chromosomes presumably sort
# alphabetically rather than numerically.
ordered_hotspot_insertions = ordered_hotspot_insertions %>% arrange(hotspot_chromosome, hotspot_coord)
# Row count and number of rows in the look-back window used for the
# neighbour comparison below.
n = nrow(ordered_hotspot_insertions)
lookback = 10
# Interactive inspection of the chr8 hotspots (opens a data viewer).
ordered_hotspot_insertions %>% filter(grepl("^chr8:", hotspot)) %>% view()
# Are there similar sample compositions for neighboring hotspots suggesting that they should be combined into single hotspots?
# Minimum Jaccard index for two hotspots to be reported as related.
MIN_J = 0.5
# Compare the sample composition of hotspot row `x` with the rows that precede
# it in `ordered_hotspot_insertions` and report the pairs whose Jaccard index
# is at least MIN_J.
#
# Reads the globals `ordered_hotspot_insertions` (columns `hotspot` and
# `hotspot_sample_counts_redef_list`, the latter a comma-separated list of
# sample ids), `lookback` (the rows x-lookback+1 .. x-1 are examined) and
# `MIN_J`.
#
# x: row index of the hotspot to test.
# Returns a data.frame with columns hotspot_A (row x), hotspot_B (the earlier
# row) and j (Jaccard index), or NULL when no earlier row qualifies.
find_related_hotspots = function(x) {
  # Split one comma-separated sample list into a character vector of sample
  # ids. strsplit() returns a list, so the single element is extracted; set
  # operations on the enclosing list would compare whole lists, not samples.
  parse_sample_list = function(sample_list) {
    if (length(sample_list) == 0 || is.na(sample_list)) {
      return(character(0))
    }
    strsplit(as.character(sample_list), ",", fixed = TRUE)[[1]]
  }

  # Nothing precedes the first row.
  if (x <= 1) {
    return(NULL)
  }

  index_row = ordered_hotspot_insertions[x,]
  index_samples = parse_sample_list(index_row$hotspot_sample_counts_redef_list)

  # Never index before the first row of the table.
  first_i = max(1, x - lookback + 1)

  related = list()
  for (i in seq(first_i, x - 1)) {
    compare_row = ordered_hotspot_insertions[i,]
    # Only the sample-list column is parsed, not the whole row.
    compare_samples = parse_sample_list(compare_row$hotspot_sample_counts_redef_list)

    n_union = length(union(index_samples, compare_samples))
    n_shared = length(intersect(index_samples, compare_samples))
    # Two empty sample lists share nothing; avoid 0/0.
    j = if (n_union > 0) n_shared / n_union else 0

    if (j >= MIN_J) {
      related[[length(related) + 1]] = data.frame(hotspot_A = index_row$hotspot,
                                                  hotspot_B = compare_row$hotspot,
                                                  j = j,
                                                  stringsAsFactors = FALSE)
    }
  }

  if (length(related) == 0) {
    return(NULL)
  }
  do.call(rbind, related)
}
# Run the neighbour comparison for every row that has a full look-back window
# and stack the per-row results.
hotspot_sample_jaccard <- do.call(
  bind_rows,
  lapply(seq(from = lookback, to = n, by = 1), find_related_hotspots)
)
#lapply(seq(lookback, 20, 1), find_related_hotspots)
hotspot_sample_jaccard
# nope
# NOTE(review): this verdict depends on find_related_hotspots() comparing the
# parsed sample lists -- re-check before relying on it.

# Copy-number boxplot per hotspot on a log10 y axis, with a dashed reference
# line at 2 copies.
hotspot_cnv_boxplot <- ggplot(
  hotspot_info_top_with_insertions_and_CNV,
  aes(x=hotspot, y=copy_number)
) +
  geom_boxplot() +
  geom_hline(yintercept = 2, linetype = 2, color = "purple") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(trans='log10')
hotspot_cnv_boxplot

# Bring all the plots together.
# Five vertically aligned panels; only the bottom panel keeps its x axis
# title, labels and ticks.
pg2 <- local({
  hide_x_axis <- theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
  plot_grid(
    hotspot_by_seqtype_plot + hide_x_axis,
    hotspot_by_project_plot + hide_x_axis,
    hotspot_cnv_boxplot + hide_x_axis,
    hotspot_expr_quantile_plot + hide_x_axis,
    hotspot_by_virus_plot,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(0.15, 0.15, 0.1, 0.1, 0.5)
  )
})
pg2
ggsave(filename = "hotspot_summary_plot.pdf", plot = pg2, width = 20, height = 15)
# Work from the comparison directory. Only the home-relative form is kept;
# the absolute /Users/... path named the same directory but was specific to
# one machine.
# NOTE(review): the relative input paths below depend on this working
# directory.
setwd("~/BroadInstProjs/CTAT-LR-Fusion-Paper/github.CTAT-LRF-Paper/3.DepMap9Lines/3a.CTAT_DepMap9Lines/3a.1.CTAT_LRF_STARF_FI_comparison")
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)

# All three tables are read with comment.char = "" (spelled out rather than
# the partially matched `com`) so that "#" is not treated as a comment marker.
# NOTE(review): presumably the header lines start with "#", which is why the
# columns arrive as X.sample / X.FusionName -- confirm against the files.

# ctat-LR-fusion calls; a missing short-read count is treated as 0.
ctat_LR_FI_data = read.table("../DepMap_v1v2mrgd.ctatLRF_FI.consolidated.tsv.gz", header=TRUE, sep="\t", stringsAsFactors = FALSE, comment.char='') %>%
  rename(FusionName = fusion) %>%
  mutate(num_SR = ifelse(is.na(num_SR), 0, num_SR))
ctat_LR_FI_data  %>% head()

# FusionInspector calls; short-read support = est_J + est_S.
FI_data = read.table("data/DepMap.v1v2mrgd.FI.consolidated.tsv.gz", header=TRUE, sep="\t", stringsAsFactors = FALSE, comment.char='') %>%
  rename(sample = X.sample, FusionName = X.FusionName, SR_FFPM = FFPM) %>%
  mutate(num_SR = est_J + est_S)
head(FI_data)

# STAR-Fusion calls; short-read support = est_J + est_S.
starF_data = read.table("data/DepMap.v1v2mrgd.StarF.consolidated.tsv.gz", header=TRUE, sep="\t", stringsAsFactors = FALSE, comment.char='') %>%
  rename(sample = X.sample, FusionName = X.FusionName, SR_FFPM = FFPM) %>%
  mutate(num_SR = est_J + est_S)
head(starF_data)

# Long-format table: one row per (sample, fusion, breakpoints, evidence type).
#   LRF    - long-read counts from the ctat-LR-fusion table
#   LRF_FI - short-read counts from the same table (rows with > 0 reads only)
#   FI     - FusionInspector short-read counts
#   STARF  - STAR-Fusion short-read counts
all_data = bind_rows(ctat_LR_FI_data  %>% select(sample, FusionName, LeftBreakpoint, RightBreakpoint, num_LR, LR_FFPM) %>%
                       rename(num_reads = num_LR, FFPM = LR_FFPM) %>% mutate(type='LRF'),
                     ctat_LR_FI_data %>% select(sample, FusionName, LeftBreakpoint, RightBreakpoint, num_SR, SR_FFPM) %>%
                       rename(num_reads = num_SR, FFPM = SR_FFPM) %>% filter(num_reads > 0) %>%
                       mutate(type='LRF_FI'),
                     FI_data %>% select(sample, FusionName, LeftBreakpoint, RightBreakpoint, num_SR, SR_FFPM) %>%
                       rename(num_reads = num_SR, FFPM = SR_FFPM) %>%
                       mutate(type='FI'),
                     starF_data %>% select(sample, FusionName, LeftBreakpoint, RightBreakpoint, num_SR, SR_FFPM) %>%
                       rename(num_reads = num_SR, FFPM = SR_FFPM) %>%
                       mutate(type='STARF')
)
all_data %>% head()

# Wide table: one column of read counts per evidence type, 0 where a fusion
# was not reported by that type.
# NOTE(review): spread() is superseded by tidyr::pivot_wider(); it is kept
# here because pivot_wider() orders rows/columns and handles duplicate keys
# differently.
all_data_spread = all_data %>% select(sample, FusionName, LeftBreakpoint, RightBreakpoint, num_reads, type) %>%
  spread(key=type, value=num_reads, fill=0)
