This document runs Seurat’s standard integration pipeline on the Study 2 data. In this pipeline, reciprocal PCA (RPCA) is used to identify the integration anchors.

library(Seurat)
library(dplyr)
library(ggplot2)

Load datasets for Study 2

# Load the saved expression (TPM) and metadata .RData files for Study 2.
# load() restores whatever objects each file contains into the current
# environment; presumably these are the *_TPMData / *_metaData objects
# used below -- confirm against the files themselves.
study2_rdata_files <- c(
  "~/Supplemental_code/unCTC_datasets/Poonia_et_al._TPMData.RData",
  "~/Supplemental_code/unCTC_datasets/Poonia_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_TPMData.RData",
  "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_TPMData.RData",
  "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Ebright_et_al._TPMData.RData",
  "~/Supplemental_code/unCTC_datasets/Ebright_et_al._metaData.RData"
)
# Files are read in the order listed above.
for (rdata_file in study2_rdata_files) {
  load(rdata_file)
}

Create the Seurat objects, using log2-transformed TPM data (log2(TPM + 1)) as input

# Build one Seurat object per dataset. The matrix passed as `counts` is
# log2(TPM + 1), and each object carries its own metadata table.
Poonia_et_al._obj <- CreateSeuratObject(
  counts = log2(Poonia_et_al._TPMData + 1),
  project = "Poonia_et_al.",
  meta.data = Poonia_et_al._metaData
)
Ebright_et_al._obj <- CreateSeuratObject(
  counts = log2(Ebright_et_al._TPMData + 1),
  project = "Ebright_et_al.",
  meta.data = Ebright_et_al._metaData
)
Ding_et_al._WBC1_obj <- CreateSeuratObject(
  counts = log2(Ding_et_al._WBC1_TPMData + 1),
  project = "Ding_et_al._WBC1",
  meta.data = Ding_et_al._WBC1_metaData
)
Ding_et_al._WBC2_obj <- CreateSeuratObject(
  counts = log2(Ding_et_al._WBC2_TPMData + 1),
  project = "Ding_et_al._WBC2",
  meta.data = Ding_et_al._WBC2_metaData
)

Seurat objects list

# Collect the four per-dataset Seurat objects into one (unnamed) list so the
# integration steps can be applied to all of them with lapply().
seurat_objs <- list(
  Poonia_et_al._obj,
  Ebright_et_al._obj,
  Ding_et_al._WBC1_obj,
  Ding_et_al._WBC2_obj
)

The normalization step is skipped here because the TPM data are already length-normalized

# Step 1: find the 2000 most variable features ("vst" method) in each
# dataset independently.
seurat_objs.list <- lapply(
  X = seurat_objs,
  FUN = function(x) {
    FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
  }
)

# Step 2: choose the features that are repeatedly variable across datasets;
# these are used for integration.
features <- SelectIntegrationFeatures(object.list = seurat_objs.list)

# Step 3: scale each dataset on the shared features and run PCA on them.
# The per-dataset PCA is what the RPCA anchor search below relies on.
seurat_objs.list <- lapply(
  X = seurat_objs.list,
  FUN = function(x) {
    x <- ScaleData(x, features = features, verbose = FALSE)
    RunPCA(x, features = features, verbose = FALSE)
  }
)
## Warning in irlba(A = t(x = object), nv = npcs, ...): You're computing too large
## a percentage of total singular values, use a standard svd instead.

Here we use RPCA as the reduction method

# Find integration anchors between the datasets using reciprocal PCA
# (reduction = "rpca") on the shared feature set.
Study2_rPCA.anchors <- FindIntegrationAnchors(
  object.list = seurat_objs.list,
  anchor.features = features,
  k.filter = 200,
  reduction = "rpca"
)

# Integrate the datasets from the anchor set; this creates an 'integrated'
# data assay on the combined object.
Study2_rPCA.combined <- IntegrateData(
  anchorset = Study2_rPCA.anchors,
  k.weight = 20
)

# Run all downstream analysis on the batch-corrected values. The original,
# unmodified data still resides in the 'RNA' assay.
DefaultAssay(Study2_rPCA.combined) <- "integrated"

# Standard workflow for visualization and clustering:
# scale -> PCA (30 components) -> UMAP and neighbour graph on PCs 1-30 ->
# clustering at resolution 0.5.
Study2_rPCA.combined <- ScaleData(
  Study2_rPCA.combined,
  verbose = FALSE
)
Study2_rPCA.combined <- RunPCA(
  Study2_rPCA.combined,
  npcs = 30,
  verbose = FALSE
)
Study2_rPCA.combined <- RunUMAP(
  Study2_rPCA.combined,
  reduction = "pca",
  dims = 1:30
)
Study2_rPCA.combined <- FindNeighbors(
  Study2_rPCA.combined,
  reduction = "pca",
  dims = 1:30
)
Study2_rPCA.combined <- FindClusters(
  Study2_rPCA.combined,
  resolution = 0.5
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1648
## Number of edges: 46128
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9256
## Number of communities: 14
## Elapsed time: 0 seconds

Color key for Visualization

# Palette of 30 named R colours, used in order as the manual colour scale
# for the DimPlot groupings below.
ColorKeyDataID <- c(
  "peru", "steelblue", "darkolivegreen4", "palevioletred4", "darkcyan",
  "darkorchid4", "darkslategray", "firebrick1", "salmon3", "paleturquoise1",
  "mediumaquamarine", "greenyellow", "black", "deepskyblue3", "mediumblue",
  "darkred", "gold", "gray50", "hotpink", "khaki3",
  "yellow4", "lavender", "cornsilk4", "orchid4", "yellow3",
  "darkgreen", "skyblue1", "khaki4", "tan4", "pink"
)

PCA

# PCA embedding coloured by the "Class" metadata column.
DimPlot(
  Study2_rPCA.combined,
  reduction = "pca",
  pt.size = 1,
  group.by = "Class",
  label = FALSE
) +
  scale_color_manual(values = ColorKeyDataID)

# PCA embedding coloured by Seurat cluster assignment.
DimPlot(
  Study2_rPCA.combined,
  reduction = "pca",
  pt.size = 1,
  group.by = "seurat_clusters",
  label = FALSE,
  repel = TRUE
) +
  scale_color_manual(values = ColorKeyDataID)

UMAP

# UMAP embedding coloured by the "Class" metadata column.
DimPlot(
  Study2_rPCA.combined,
  reduction = "umap",
  pt.size = 1,
  group.by = "Class",
  label = FALSE
) +
  scale_color_manual(values = ColorKeyDataID)

# UMAP embedding coloured by Seurat cluster assignment.
DimPlot(
  Study2_rPCA.combined,
  reduction = "umap",
  pt.size = 1,
  group.by = "seurat_clusters",
  label = FALSE,
  repel = TRUE
) +
  scale_color_manual(values = ColorKeyDataID)

Create a data frame from the Seurat object

# Assemble a per-cell table: the UMAP coordinates from the integrated
# object, plus each cell's annotated cell type and Seurat cluster taken from
# the object's metadata.
seurat_umap <- data.frame(
  Study2_rPCA.combined@reductions$umap@cell.embeddings,
  Cell_type = Study2_rPCA.combined@meta.data$Cell_type,
  Clusters = Study2_rPCA.combined@meta.data$seurat_clusters,
  stringsAsFactors = FALSE
)
# Give the embedding columns fixed names regardless of how Seurat labels them.
colnames(seurat_umap) <- c("UMAP1", "UMAP2", "Cell_type", "Clusters")

Bar plot of cell-type composition per cluster

# Stacked bar chart of the number of cells in each Seurat cluster, with bars
# split by annotated cell type.
# Only the `fill` aesthetic is mapped, so no colour scale or colour legend
# guide is added: they would have nothing to act on.
# NOTE(review): the fill palette has four entries, so this assumes at most
# four distinct Cell_type values -- confirm against the metadata.
ggplot(seurat_umap, aes(x = Clusters, fill = Cell_type)) +
  theme_classic() +
  geom_bar(stat = "count") +
  scale_fill_manual(
    values = c("dodgerblue4", "firebrick3", "darkgreen", "dark turquoise")
  ) +
  theme(
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 16),
    legend.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 22, face = "bold")
  )

ARI, NMI and Cluster purity

# Adjusted Rand index between the Seurat clusters and the cell-type labels.
aricode::ARI(seurat_umap[["Clusters"]], seurat_umap[["Cell_type"]])
## [1] 0.09160131
# Normalized mutual information between the same two labelings.
aricode::NMI(seurat_umap[["Clusters"]], seurat_umap[["Cell_type"]])
## [1] 0.1387896
#' Cluster purity of a clustering with respect to reference classes
#'
#' Cross-tabulates classes against clusters, takes the count of the most
#' frequent class within each cluster, and divides the sum of those counts
#' by the number of cluster assignments.
#'
#' @param clusters Vector of cluster assignments, one per observation.
#' @param classes Vector of reference class labels, same length as `clusters`.
#' @return A single numeric value: summed per-cluster maximum class counts
#'   divided by `length(clusters)`.
ClusterPurity <- function(clusters, classes) {
  # Rows are classes, columns are clusters.
  contingency <- table(classes, clusters)
  # Size of the dominant class inside each cluster (one value per column).
  dominant_counts <- apply(contingency, 2, max)
  sum(dominant_counts) / length(clusters)
}
# Purity of the Seurat clusters with respect to the annotated cell types.
ClusterPurity(
  clusters = seurat_umap$Clusters,
  classes = seurat_umap$Cell_type
)
## [1] 0.8574029