suppressPackageStartupMessages({
library(Dozer)
library(Matrix)
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(dplyr)
library(knitr)
library(foreach)
library(doParallel)
library(cluster)
library(limma)
library(enrichR)
library(Rtsne)
})
load(system.file("extdata", "Jerber_demo.rda", package = "Dozer"))
theme_set(theme_pubr(base_size = 12))## Barplot for the differentiation efficiency among donors.
ggplot(donor_info, aes(diff_efficiency, fill = phenotype))+
geom_histogram()+labs(fill = 'Phenotype')+
xlab('Differentiation efficiency')+ylab('Number of donors')
## Visualize all cells in a scatter plot with color labels for donors.
ggplot(metadata, aes(tSNE_1, tSNE_2, color = phenotype))+geom_point()+
ggtitle('Scatter plot of cells')+theme(legend.position ='right')+labs(color = 'Phenotype')
path = paste0(tempdir(), '/dozer_tmp')
if (! file.exists(path)){
dir.create(path)
}
cl <- makeCluster(detectCores())
registerDoParallel(cl)
## Load noise ratio into a "gene by donor" matrix.
noise_ratio_gene_by_donor = foreach (i = 1:nrow(donor_info), .packages = c('Matrix'), .combine = 'cbind') %dopar%{
donor = donor_info$donor_id[i]
data = matrix(counts[ , metadata$donor_id == donor], nrow = nrow(counts))
meta = metadata[metadata$donor_id == donor, ]
## If there are several sample_index presented in one dataset, regress it out.
res = Dozer::compute_gene_correlation(data,
covs = data.frame(meta$sample_id),
gene_group_quantile = c(.5, .8))
## Saver co-expression matrices to file
save(res, file = paste0(path, donor, '-coexpr.rda'))
res$ratio[,1]
}
stopCluster(cl)
## Keep genes whose average noise ratio smaller than 0.9.
keep_gene = rowMeans(noise_ratio_gene_by_donor) < .9
gene_name = rownames(counts)[keep_gene]
## The number of genes passed filtering with noise ratio.
sum(keep_gene)
#> [1] 961## Compute gene centrality.
cl <- makeCluster(detectCores())
registerDoParallel(cl)
centrality = foreach(i = 1:nrow(donor_info), .combine = 'c', .multicombine = T) %dopar% {
donor = donor_info$donor_id[i]
load(paste0(path, donor, '-coexpr.rda'))
Dozer::compute_centrality(res$network[keep_gene, keep_gene], threshold = .95)
}
stopCluster(cl)
## Since centrality is computed in parallel and combined into a single matrix, we separate each centrality mode into its own matrix.
centrality = data.frame(centrality, row.names = gene_name)
degree = centrality[, seq(1, ncol(centrality), 4)]
pagerank = centrality[, seq(2, ncol(centrality), 4)]
betweenness = centrality[, seq(3, ncol(centrality), 4)]
eigenvector = centrality[, seq(4, ncol(centrality), 4)]
## A function conducting tSNE dimension reduction for centrality matrices.
compute_tsne <- function(dat){
set.seed(1)
dat = log1p(dat)
pr <- prcomp(dat, scale = T, center = T)
tsne = Rtsne::Rtsne(pr$rotation[, 1:3], perplexity = 20)
res = data.frame(pr$rotation[, 1:5], tsne$Y)
colnames(res) = paste0('X', 1:ncol(res))
return(res)
}
## Compute tSNE reduction for each centrality mode.
df.tsne = rbind(data.frame(compute_tsne(degree), donor_info, method = 'Dozer', centrality = 'degree'),
data.frame(compute_tsne(pagerank), donor_info, method = 'Dozer', centrality = 'pagerank'),
data.frame(compute_tsne(betweenness), donor_info, method = 'Dozer', centrality = 'betweenness'),
data.frame(compute_tsne(eigenvector), donor_info, method = 'Dozer', centrality = 'eigenvector'))
print(apply(df.tsne[1:nrow(donor_info), 1:5], 2, FUN=function(x){cor(x, donor_info$diff_efficiency)}))
#> X1 X2 X3 X4 X5
#> 0.2440385 0.6391591 0.4534432 -0.1527256 0.1219386
## Plot tSNE coordinates of gene centrality with color labels for differentiation efficiency group.
ggplot(df.tsne, aes(X6, X7, color = phenotype)) + geom_point(size = 3) + facet_wrap( ~ centrality, nrow = 1, scales='free') +
labs(color = 'Phenotype') + xlab('tSNE_1') + ylab('tSNE_2')
## Compute silhouette score for the separation of gene centrality between the two differentiation efficiency group.
df.tsne %>% group_by(centrality) %>%
group_modify(~ {
data.frame( silhouette_score =
mean(silhouette(as.integer(as.factor(.x$phenotype)), daisy(.x[,1:3]))[,3]))}) %>%
kable(format = "html")| centrality | silhouette_score |
|---|---|
| betweenness | 0.3315516 |
| degree | 0.3261288 |
| eigenvector | 0.2171806 |
| pagerank | 0.2160431 |
centrality_label = colnames(centrality)[1:4]
DC.list = list()
for (j in 1:4){
df.centrality = log1p(centrality[, seq(j, ncol(centrality), 4)])
group = donor_info[, 'phenotype']
design2 <- model.matrix( ~ group - 1)
colnames(design2) <- c("Success", "Failure")
fit2 <- lmFit(df.centrality, design2)
contrast.matrix <- makeContrasts("Success-Failure", levels = design2)
fit2C <- contrasts.fit(fit2, contrast.matrix)
fit2C <- eBayes(fit2C)
tab = topTable(fit2C, number = nrow(df.centrality))[gene_name,]
DC.list[[j]] = data.frame(tab, centrality = centrality_label[j])
}
DC_test = bind_rows(DC.list, .id = "column_label")
DC_test$Significant = DC_test$adj.P.Val<.05
DC_test %>% ggplot(aes(logFC, -log10(adj.P.Val), color = Significant))+geom_point()+
facet_wrap(~centrality, scales = 'free')+ylab('-log10 adjusted p value')+scale_color_manual(values = c('grey50', 'red')) + ggtitle(' Differential centrality genes between success and failure in neuronal differentiation')# centrality mode
mode = 'degree'
# number of enriched terms shown in barplot
nterm = 15
# database
dbs <- "KEGG_2021_Human"
gene_set = rownames(DC_test%>% filter(centrality == mode & adj.P.Val < .05))
if (length(gene_set)>0){
enriched <- enrichr(databases = dbs, genes = as.character(gene_set))
nterm = min(nterm, nrow(enriched[[1]]))
enriched[[1]][1:nterm,] %>%mutate(Term = factor(Term, levels = Term[seq(nterm,1)]),
overlap_gene_count = unlist(lapply(Overlap, FUN=function(x){as.integer(strsplit(x, '/')[[1]][1])}))) %>%
ggplot(aes(Term, -log10(Adjusted.P.value), fill = overlap_gene_count))+geom_bar(stat = 'identity')+
scale_fill_gradient(low='blue', high = 'red')+coord_flip()+labs(fill = 'Overlapping gene counts')+
ylab('-log10 adjusted p value') + ggtitle(paste0('Gene set enrichment for differential ', mode, ' genes'))
}
#> Uploading data to Enrichr... Done.
#> Querying KEGG_2021_Human... Done.
#> Parsing results... Done.## mark the genes in Term "Pathways of neurodegeneration" on the scatter plot
labels = strsplit((head(enriched[[1]] %>% filter(Term == 'Pathways of neurodegeneration')))$Genes, ';')[[1]]
scatter.plot.df = data.frame(centrality = (DC_test %>% filter(centrality==mode))$adj.P.Val,
expression = DE_genes[rownames(DC_test %>% filter(centrality==mode)),'p_val_adj']) %>%
mutate(expression = pmax(expression, 1e-300),
signif_group = unlist(lapply((centrality <.05) + 2*(expression<.05),
FUN=function(x){
if(x==0){
return('Not significant')
}else if(x==1){
return('Significant in centrality')
}else if(x==2){
return('Significant in expression')
}else{
return('Significant in both')
}})))
rownames(scatter.plot.df) = rownames(DC_test %>% filter(centrality==mode))
scatter.plot.df %>%
ggplot(aes(expression, centrality, color = signif_group))+geom_point()+
scale_x_log10()+scale_y_log10()+
geom_text_repel(
data = scatter.plot.df[labels,],
aes(label = labels),
fontface='bold',
size = 3,
box.padding = unit(1, "lines"),
point.padding = unit(0.3, "lines"),
max.overlaps = Inf, show.legend = F)+
xlab('Adjusted p value \n differential expression')+ylab(paste0('Adjusted p value \n differential ', mode))+
theme(legend.position='top')+ guides(color = guide_legend(nrow = 2, byrow = TRUE, override.aes = list(size=3)))+
labs(color = 'Statistical significance') +scale_color_manual(values = c('grey50', 'steelblue', 'darkred', 'orange'))## Compute average networks in each donor group
ngene = sum(keep_gene)
n_success = sum(donor_info$phenotype=='Success')
n_failure = sum(donor_info$phenotype=='Failure')
network_success = matrix(0, nrow = ngene, ncol = ngene, dimnames = list(gene_name, gene_name) )
network_failure = matrix(0, nrow = ngene, ncol = ngene, dimnames = list(gene_name, gene_name) )
for(i in 1:nrow(donor_info)){
donor = donor_info$donor_id[i]
load(paste0(path, donor, '-coexpr.rda'))
# hard-thresholding
network_i = abs(res$network[keep_gene, keep_gene])
q = quantile(network_i[upper.tri(network_i)], .95)
network_i[network_i<q] = 0
network_i[network_i>0] = 1
if (donor_info$phenotype[i] == 'Success'){
network_success = network_success + network_i/n_success
}else{
network_failure = network_failure + network_i/n_failure
}
}
## Conduct hierarchical clustering on the difference of the two average networks.
clust = clustering_difference_network(network_success, network_failure, minClusterSize = 30)
#> ..cutHeight not given, setting it to 22.5 ===> 99% of the (truncated) height range in dendro.
#> ..done.
category = sort(unique(clust))
density = data.frame(matrix(0, nrow=nrow(donor_info), ncol = length(category)))
colnames(density) = paste0('Cluster_', category)
## Compute module density of in each donor specific network and test if there is significant difference in module densities between groups.
df.density = list()
row.names(donor_info) = NULL
for(i in 1:nrow(donor_info)){
donor = donor_info$donor_id[i]
load(paste0(path, donor, '-coexpr.rda'))
network_i = abs(res$network[keep_gene, keep_gene])
df.density[[i]] =
bind_rows(lapply(category, FUN=function(x){
data.frame(donor_info[i, ], cluster = paste0('Cluster ', x),
module_density = mean(network_i[clust==x, clust==x]))}))
}
## Visualize module densities.
df.density %>% bind_rows() %>%
ggplot(aes(phenotype, module_density, color = phenotype))+geom_boxplot(width=.3)+
stat_compare_means()+facet_wrap(cluster~., nrow=2) + labs(color = 'Phenotype')term.list = list()
for (i in 1:length(category)){
enriched <- enrichr(databases = dbs, genes = names(clust)[clust==category[i]])[[1]]
enriched$cluster = paste0('Cluster_',category[i])
if (sum(enriched$Adjusted.P.value<.05)>0){
term.list[[i]] = enriched[1:min(5, sum(enriched$Adjusted.P.value<.05)),]
}
}
#> Uploading data to Enrichr... Done.
#> Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#> Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#> Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#> Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#> Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#> Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
df.kegg <- bind_rows(term.list, .id = "column_label")
terms = df.kegg$Term[!duplicated(df.kegg$Term)]
term_gene = lapply(terms, FUN=function(x){unique(unlist(lapply(df.kegg$Genes[df.kegg$Term==x], FUN=function(y){strsplit(y, ';')[[1]]})))})
nterm = length(term_gene)
similarity_mat = matrix(1, nrow=nterm, ncol = nterm)
for(i in 1:(nterm-1)){
for (j in (i+1):nterm){
similarity_mat[i, j] = length(intersect(term_gene[[i]], term_gene[[j]]))/length(unique(c(term_gene[[i]], term_gene[[j]])))
similarity_mat[j, i] = similarity_mat[i,j]
}
}
orders =terms[hclust(as.dist(1-similarity_mat))$order]
df.kegg %>%
mutate( overlap_gene_count = unlist(lapply(Overlap, FUN=function(x){as.integer(strsplit(x, '/')[[1]][1])})), Term = factor(Term, levels = orders)) %>%
ggplot(aes(Term, -log10(Adjusted.P.value), fill = overlap_gene_count))+geom_bar(stat = 'identity')+
scale_fill_gradient(low='blue', high = 'red', trans='log10')+coord_flip()+labs(fill = 'Overlapping gene counts')+
ylab('-log10 adjusted p value') +facet_grid(~cluster, scales='free')+theme(legend.position = 'top')If expression of each individual gene grows proportionally with trimmed total UMI counts, using one cell size estimation for the normalization of all genes is adequate, otherwise, estimating cell size separately for each gene is preferable. We displayed the slope of linear regression where represent raw counts of gene and trimmed total UMI counts, and represent their mean. If gene expression grows proportionally with trimmed total UMI counts, regression slopes for all genes will be centered around one.
# Use data from one donor as an example
donor = donor_info$donor_id[2]
data = counts[, metadata$donor_id %in% donor]
normalize_with_global_cell_size = Dozer::diagnoistic_plot_cell_size(data, n=20)
nromalize_with_gene_specific_cell_size = Dozer::diagnoistic_plot_cell_size(data, n=20, gene_group_quantile = c(.5, .8))
normalize_with_global_cell_size[[1]]normalize_with_global_cell_size[[2]] + ggtitle('Normalize with global cell size')nromalize_with_gene_specific_cell_size[[2]] + ggtitle('Normalize with gene specifc cell size')# delete the folder storing co-expression matrices
unlink(path, recursive = TRUE)sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.1 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] parallel stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] Rtsne_0.16 enrichR_3.1 limma_3.50.3 cluster_2.1.2
#> [5] doParallel_1.0.17 iterators_1.0.14 foreach_1.5.2 knitr_1.40
#> [9] dplyr_1.0.10 ggpubr_0.4.0 ggrepel_0.9.1 ggplot2_3.3.6
#> [13] Matrix_1.5-1 Dozer_0.0.0.9000
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.9 lattice_0.20-45 tidyr_1.2.1
#> [4] assertthat_0.2.1 digest_0.6.29 utf8_1.2.2
#> [7] R6_2.5.1 backports_1.4.1 dynamicTreeCut_1.63-1
#> [10] evaluate_0.16 highr_0.9 httr_1.4.4
#> [13] pillar_1.8.1 rlang_1.0.6 curl_4.3.2
#> [16] car_3.1-0 jquerylib_0.1.4 rmarkdown_2.16
#> [19] labeling_0.4.2 stringr_1.4.1 munsell_0.5.0
#> [22] broom_1.0.1 compiler_4.1.2 xfun_0.33
#> [25] pkgconfig_2.0.3 htmltools_0.5.3 tidyselect_1.1.2
#> [28] tibble_3.1.8 codetools_0.2-18 viridisLite_0.4.1
#> [31] fansi_1.0.3 withr_2.5.0 grid_4.1.2
#> [34] jsonlite_1.8.0 gtable_0.3.1 lifecycle_1.0.2
#> [37] DBI_1.1.3 magrittr_2.0.3 scales_1.2.1
#> [40] cli_3.4.0 stringi_1.7.8 cachem_1.0.6
#> [43] carData_3.0-5 farver_2.1.1 ggsignif_0.6.3
#> [46] bslib_0.4.0 ellipsis_0.3.2 generics_0.1.3
#> [49] vctrs_0.4.1 rjson_0.2.21 tools_4.1.2
#> [52] glue_1.6.2 purrr_0.3.4 abind_1.4-5
#> [55] fastmap_1.1.0 yaml_2.3.5 colorspace_2.0-3
#> [58] rstatix_0.7.0 sass_0.4.2