Vignette for Dozer: Debiased personalized gene co-expression networks for population-scale scRNA-seq data

suppressPackageStartupMessages({
  library(Dozer)
  library(Matrix)
  library(ggplot2)
  library(ggrepel)
  library(ggpubr)
  library(dplyr)
  library(knitr)
  library(foreach)
  library(doParallel)
  library(cluster)
  library(limma)
  library(enrichR)
  library(Rtsne)
})

load(system.file("extdata", "Jerber_demo.rda", package = "Dozer"))
theme_set(theme_pubr(base_size = 12))

1 Data description

## Barplot for the differentiation efficiency among donors.
ggplot(donor_info, aes(diff_efficiency, fill = phenotype))+
  geom_histogram()+labs(fill = 'Phenotype')+
  xlab('Differentiation efficiency')+ylab('Number of donors')


## Visualize all cells in a scatter plot with color labels for donors.
ggplot(metadata, aes(tSNE_1, tSNE_2, color = phenotype))+geom_point()+
  ggtitle('Scatter plot of cells')+theme(legend.position ='right')+labs(color = 'Phenotype')

2 Compute gene-gene correlation matrix and gene-specific noise ratio.


path = paste0(tempdir(), '/dozer_tmp')
if (! file.exists(path)){
  dir.create(path)
}
cl <- makeCluster(detectCores()) 
registerDoParallel(cl)
## Load noise ratio into a "gene by donor" matrix.
noise_ratio_gene_by_donor = foreach (i = 1:nrow(donor_info), .packages = c('Matrix'), .combine = 'cbind') %dopar%{
    donor = donor_info$donor_id[i]
    data = matrix(counts[ , metadata$donor_id == donor], nrow = nrow(counts))
    meta = metadata[metadata$donor_id == donor, ]
    
    ## If there are several sample_index presented in one dataset, regress it out.
    res = Dozer::compute_gene_correlation(data, 
                                   covs = data.frame(meta$sample_id), 
                                   gene_group_quantile = c(.5, .8))
    ## Saver co-expression matrices to file
    save(res, file = paste0(path, donor, '-coexpr.rda'))
    res$ratio[,1]
}
stopCluster(cl)

2.1 Filter genes by their noise ratio.


## Keep genes whose average noise ratio smaller than 0.9.
keep_gene = rowMeans(noise_ratio_gene_by_donor) < .9
gene_name = rownames(counts)[keep_gene]
## The number of genes passed filtering with noise ratio.
sum(keep_gene)
#> [1] 961

3 Gene centrality analysis

3.1 Compute and visualize gene centrality

## Compute gene centrality.
cl <- makeCluster(detectCores()) 
registerDoParallel(cl)
centrality = foreach(i = 1:nrow(donor_info), .combine = 'c', .multicombine = T) %dopar% {
  donor = donor_info$donor_id[i]
  load(paste0(path, donor, '-coexpr.rda'))
  Dozer::compute_centrality(res$network[keep_gene, keep_gene], threshold = .95)
}
stopCluster(cl)
## Since centrality is computed in parallel and combined into a single matrix, we separate each centrality mode into its own matrix.
centrality = data.frame(centrality, row.names = gene_name)
degree = centrality[, seq(1, ncol(centrality), 4)]
pagerank = centrality[, seq(2, ncol(centrality), 4)]
betweenness = centrality[, seq(3, ncol(centrality), 4)]
eigenvector = centrality[, seq(4, ncol(centrality), 4)]

## A function conducting tSNE dimension reduction for centrality matrices.
compute_tsne <- function(dat){
  set.seed(1)
  dat = log1p(dat)
  pr <- prcomp(dat, scale = T, center = T)
  tsne = Rtsne::Rtsne(pr$rotation[, 1:3], perplexity = 20)
  res = data.frame(pr$rotation[, 1:5], tsne$Y)
  colnames(res) = paste0('X', 1:ncol(res))
  return(res)
}

## Compute tSNE reduction for each centrality mode.
df.tsne = rbind(data.frame(compute_tsne(degree), donor_info, method = 'Dozer', centrality = 'degree'),
               data.frame(compute_tsne(pagerank), donor_info, method = 'Dozer', centrality = 'pagerank'),
               data.frame(compute_tsne(betweenness), donor_info, method = 'Dozer', centrality = 'betweenness'),
               data.frame(compute_tsne(eigenvector), donor_info, method = 'Dozer', centrality = 'eigenvector'))
print(apply(df.tsne[1:nrow(donor_info), 1:5], 2, FUN=function(x){cor(x, donor_info$diff_efficiency)}))
#>         X1         X2         X3         X4         X5 
#>  0.2440385  0.6391591  0.4534432 -0.1527256  0.1219386


## Plot tSNE coordinates of gene centrality with color labels for differentiation efficiency group.
ggplot(df.tsne, aes(X6, X7, color = phenotype)) + geom_point(size = 3) + facet_wrap( ~ centrality, nrow = 1, scales='free') +
  labs(color = 'Phenotype') + xlab('tSNE_1') + ylab('tSNE_2')


## Compute silhouette score for the separation of gene centrality between the two differentiation efficiency group.

df.tsne %>% group_by(centrality) %>%
  group_modify(~ {
     data.frame( silhouette_score = 
      mean(silhouette(as.integer(as.factor(.x$phenotype)), daisy(.x[,1:3]))[,3]))}) %>% 
    kable(format = "html")
centrality silhouette_score
betweenness 0.3315516
degree 0.3261288
eigenvector 0.2171806
pagerank 0.2160431

3.2 Compute differential centrality genes from Dozer co-expression network and display on a volcano plot.

centrality_label = colnames(centrality)[1:4]
DC.list = list()
for (j in 1:4){
  df.centrality = log1p(centrality[, seq(j, ncol(centrality), 4)])
  group = donor_info[, 'phenotype']
  design2 <- model.matrix( ~ group - 1)
  colnames(design2) <- c("Success", "Failure")
  fit2 <- lmFit(df.centrality, design2)
  contrast.matrix <- makeContrasts("Success-Failure", levels = design2)
  fit2C <- contrasts.fit(fit2, contrast.matrix)
  fit2C <- eBayes(fit2C)
  tab = topTable(fit2C, number = nrow(df.centrality))[gene_name,]
  DC.list[[j]] = data.frame(tab, centrality = centrality_label[j])
}
DC_test = bind_rows(DC.list, .id = "column_label") 
DC_test$Significant = DC_test$adj.P.Val<.05
DC_test %>% ggplot(aes(logFC, -log10(adj.P.Val), color = Significant))+geom_point()+
  facet_wrap(~centrality, scales = 'free')+ylab('-log10 adjusted p value')+scale_color_manual(values = c('grey50', 'red')) + ggtitle(' Differential centrality genes between success and failure in neuronal differentiation')

3.3 Gene set enrichment for differential centrality genes.

# centrality mode
mode = 'degree'
# number of enriched terms shown in barplot 
nterm = 15  
# database
dbs <- "KEGG_2021_Human"

gene_set = rownames(DC_test%>% filter(centrality == mode & adj.P.Val < .05))

if (length(gene_set)>0){
  enriched <- enrichr(databases = dbs, genes = as.character(gene_set))
  nterm = min(nterm, nrow(enriched[[1]]))
  enriched[[1]][1:nterm,] %>%mutate(Term = factor(Term, levels = Term[seq(nterm,1)]), 
    overlap_gene_count = unlist(lapply(Overlap, FUN=function(x){as.integer(strsplit(x, '/')[[1]][1])}))) %>% 
    ggplot(aes(Term, -log10(Adjusted.P.value), fill = overlap_gene_count))+geom_bar(stat = 'identity')+
    scale_fill_gradient(low='blue', high = 'red')+coord_flip()+labs(fill = 'Overlapping gene counts')+
    ylab('-log10 adjusted p value') + ggtitle(paste0('Gene set enrichment for differential ', mode, ' genes'))
}
#> Uploading data to Enrichr... Done.
#>   Querying KEGG_2021_Human... Done.
#> Parsing results... Done.

3.4 Compare adjusted p-values from differential expression and differential degree in a scatter plot.

## mark the genes in Term "Pathways of neurodegeneration" on the scatter plot
labels = strsplit((head(enriched[[1]] %>% filter(Term == 'Pathways of neurodegeneration')))$Genes, ';')[[1]]
scatter.plot.df = data.frame(centrality = (DC_test %>% filter(centrality==mode))$adj.P.Val,
        expression = DE_genes[rownames(DC_test %>% filter(centrality==mode)),'p_val_adj']) %>%
        mutate(expression = pmax(expression, 1e-300), 
               signif_group = unlist(lapply((centrality <.05) + 2*(expression<.05), 
                  FUN=function(x){
                  if(x==0){
                    return('Not significant')
                  }else if(x==1){
                    return('Significant in centrality')
                  }else if(x==2){
                    return('Significant in expression')
                  }else{
                    return('Significant in both')
                  }})))
rownames(scatter.plot.df) = rownames(DC_test %>% filter(centrality==mode))
scatter.plot.df %>% 
  ggplot(aes(expression, centrality, color = signif_group))+geom_point()+
  scale_x_log10()+scale_y_log10()+
    geom_text_repel(
    data = scatter.plot.df[labels,],
    aes(label = labels),
    fontface='bold',
    size = 3,
    box.padding = unit(1, "lines"),
    point.padding = unit(0.3, "lines"),
    max.overlaps = Inf, show.legend = F)+
  xlab('Adjusted p value \n differential expression')+ylab(paste0('Adjusted p value \n differential ', mode))+
theme(legend.position='top')+  guides(color = guide_legend(nrow = 2, byrow = TRUE, override.aes = list(size=3)))+
  labs(color = 'Statistical significance') +scale_color_manual(values = c('grey50', 'steelblue', 'darkred', 'orange'))

4 “Difference network” between phenotypic groups

4.1 Detect modules in the “differene network” between Success and Failure group.

## Compute average networks in each donor group
ngene = sum(keep_gene)
n_success = sum(donor_info$phenotype=='Success')
n_failure = sum(donor_info$phenotype=='Failure')

network_success = matrix(0, nrow = ngene, ncol = ngene, dimnames = list(gene_name, gene_name) )
network_failure = matrix(0, nrow = ngene, ncol = ngene, dimnames = list(gene_name, gene_name) )

for(i in 1:nrow(donor_info)){
  donor = donor_info$donor_id[i]
  load(paste0(path, donor, '-coexpr.rda'))
  # hard-thresholding
  network_i = abs(res$network[keep_gene, keep_gene])
  q = quantile(network_i[upper.tri(network_i)], .95)
  network_i[network_i<q] = 0
  network_i[network_i>0] = 1
  
  if (donor_info$phenotype[i] == 'Success'){
    network_success = network_success + network_i/n_success
  }else{
    network_failure = network_failure + network_i/n_failure
  }
}
## Conduct hierarchical clustering on the difference of the two average networks.
clust = clustering_difference_network(network_success, network_failure, minClusterSize = 30)
#>  ..cutHeight not given, setting it to 22.5  ===>  99% of the (truncated) height range in dendro.
#>  ..done.
category = sort(unique(clust))
density  = data.frame(matrix(0, nrow=nrow(donor_info), ncol = length(category)))
colnames(density) = paste0('Cluster_', category)
## Compute module density of in each donor specific network and test if there is significant difference in module densities between groups.
df.density = list()
row.names(donor_info) = NULL
for(i in 1:nrow(donor_info)){
  donor = donor_info$donor_id[i]
  load(paste0(path, donor, '-coexpr.rda'))
  network_i = abs(res$network[keep_gene, keep_gene])
  df.density[[i]] =
       bind_rows(lapply(category, FUN=function(x){
         data.frame(donor_info[i, ], cluster = paste0('Cluster ', x), 
                    module_density = mean(network_i[clust==x, clust==x]))}))
}

## Visualize module densities.
df.density %>% bind_rows() %>%
  ggplot(aes(phenotype, module_density, color = phenotype))+geom_boxplot(width=.3)+
    stat_compare_means()+facet_wrap(cluster~., nrow=2) + labs(color = 'Phenotype')

4.2 Gene set enrichment for modules from difference network.

term.list = list()
for (i in 1:length(category)){
  enriched <- enrichr(databases = dbs, genes = names(clust)[clust==category[i]])[[1]]
  enriched$cluster = paste0('Cluster_',category[i])
  if (sum(enriched$Adjusted.P.value<.05)>0){
    term.list[[i]] = enriched[1:min(5, sum(enriched$Adjusted.P.value<.05)),]  
  }
}
#> Uploading data to Enrichr... Done.
#>   Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#>   Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#>   Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#>   Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#>   Querying KEGG_2021_Human... Done.
#> Parsing results... Done.
#> Uploading data to Enrichr... Done.
#>   Querying KEGG_2021_Human... Done.
#> Parsing results... Done.

df.kegg <- bind_rows(term.list, .id = "column_label") 
terms = df.kegg$Term[!duplicated(df.kegg$Term)]
term_gene = lapply(terms, FUN=function(x){unique(unlist(lapply(df.kegg$Genes[df.kegg$Term==x], FUN=function(y){strsplit(y, ';')[[1]]})))})
nterm = length(term_gene)
similarity_mat = matrix(1, nrow=nterm, ncol = nterm)
for(i in 1:(nterm-1)){
  for (j in (i+1):nterm){
    similarity_mat[i, j] = length(intersect(term_gene[[i]], term_gene[[j]]))/length(unique(c(term_gene[[i]], term_gene[[j]])))
    similarity_mat[j, i] = similarity_mat[i,j]
  }
}
orders =terms[hclust(as.dist(1-similarity_mat))$order]
df.kegg %>%  
   mutate( overlap_gene_count = unlist(lapply(Overlap, FUN=function(x){as.integer(strsplit(x, '/')[[1]][1])})), Term = factor(Term, levels = orders)) %>% 
   ggplot(aes(Term, -log10(Adjusted.P.value), fill = overlap_gene_count))+geom_bar(stat = 'identity')+
  scale_fill_gradient(low='blue', high = 'red', trans='log10')+coord_flip()+labs(fill = 'Overlapping gene counts')+
  ylab('-log10 adjusted p value') +facet_grid(~cluster, scales='free')+theme(legend.position = 'top')

5 Diagnostic plot for the adequacy of a global cell size estimation for the normalization of all genes.

If expression of each individual gene grows proportionally with trimmed total UMI counts, using one cell size estimation for the normalization of all genes is adequate, otherwise, estimating cell size separately for each gene is preferable. We displayed the slope of linear regression YjY¯jll¯ where Yj,l represent raw counts of gene j and trimmed total UMI counts, and Y¯j,l¯ represent their mean. If gene expression grows proportionally with trimmed total UMI counts, regression slopes for all genes will be centered around one.

# Use data from one donor as an example
donor = donor_info$donor_id[2]
data = counts[, metadata$donor_id %in% donor]
normalize_with_global_cell_size = Dozer::diagnoistic_plot_cell_size(data, n=20)
nromalize_with_gene_specific_cell_size = Dozer::diagnoistic_plot_cell_size(data, n=20, gene_group_quantile = c(.5, .8))

normalize_with_global_cell_size[[1]]

normalize_with_global_cell_size[[2]] + ggtitle('Normalize with global cell size')

nromalize_with_gene_specific_cell_size[[2]] + ggtitle('Normalize with gene specifc cell size')

# delete the folder storing co-expression matrices
unlink(path, recursive = TRUE)
sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.1 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] parallel  stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] Rtsne_0.16        enrichR_3.1       limma_3.50.3      cluster_2.1.2    
#>  [5] doParallel_1.0.17 iterators_1.0.14  foreach_1.5.2     knitr_1.40       
#>  [9] dplyr_1.0.10      ggpubr_0.4.0      ggrepel_0.9.1     ggplot2_3.3.6    
#> [13] Matrix_1.5-1      Dozer_0.0.0.9000 
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.9            lattice_0.20-45       tidyr_1.2.1          
#>  [4] assertthat_0.2.1      digest_0.6.29         utf8_1.2.2           
#>  [7] R6_2.5.1              backports_1.4.1       dynamicTreeCut_1.63-1
#> [10] evaluate_0.16         highr_0.9             httr_1.4.4           
#> [13] pillar_1.8.1          rlang_1.0.6           curl_4.3.2           
#> [16] car_3.1-0             jquerylib_0.1.4       rmarkdown_2.16       
#> [19] labeling_0.4.2        stringr_1.4.1         munsell_0.5.0        
#> [22] broom_1.0.1           compiler_4.1.2        xfun_0.33            
#> [25] pkgconfig_2.0.3       htmltools_0.5.3       tidyselect_1.1.2     
#> [28] tibble_3.1.8          codetools_0.2-18      viridisLite_0.4.1    
#> [31] fansi_1.0.3           withr_2.5.0           grid_4.1.2           
#> [34] jsonlite_1.8.0        gtable_0.3.1          lifecycle_1.0.2      
#> [37] DBI_1.1.3             magrittr_2.0.3        scales_1.2.1         
#> [40] cli_3.4.0             stringi_1.7.8         cachem_1.0.6         
#> [43] carData_3.0-5         farver_2.1.1          ggsignif_0.6.3       
#> [46] bslib_0.4.0           ellipsis_0.3.2        generics_0.1.3       
#> [49] vctrs_0.4.1           rjson_0.2.21          tools_4.1.2          
#> [52] glue_1.6.2            purrr_0.3.4           abind_1.4-5          
#> [55] fastmap_1.1.0         yaml_2.3.5            colorspace_2.0-3     
#> [58] rstatix_0.7.0         sass_0.4.2