Seurat’s standard integration pipeline applied to the Study 1 data. In this pipeline, canonical correlation analysis (CCA) is used to identify the integration anchors between datasets.

library(Seurat)
library(dplyr)
library(ggplot2)

Load datasets for Study 1

# Directory holding the Study 1 .RData files. Change this single value if the
# supplemental data lives somewhere else.
unCTC_data_dir <- "~/Supplemental_code/unCTC_datasets"

# First-author prefix of each Study 1 dataset. Every study has two files:
# "<study>_et_al._Data.RData" and "<study>_et_al._metaData.RData".
study1_studies <- c(
  "Zheng", "Velten", "Sarioglu", "Jordan", "Aceto", "Yu", "Ting"
)

# A top-level `for` loop (not lapply) so that load() keeps restoring the saved
# objects into the calling environment, as the individual calls did.
# NOTE(review): the rest of the script expects each file to restore an object
# named after the file (e.g. Zheng_et_al._Data) -- confirm against the files.
for (study in study1_studies) {
  for (file_kind in c("Data", "metaData")) {
    load(file.path(
      unCTC_data_dir,
      paste0(study, "_et_al._", file_kind, ".RData")
    ))
  }
}

Create Seurat objects

# Build one Seurat object per study. Each call passes the study's data as
# `counts`, its metadata table as `meta.data`, and the dataset name as the
# project label.
Velten_et_al._Data_obj <- CreateSeuratObject(
  counts = Velten_et_al._Data,
  project = "Velten_et_al._Data",
  meta.data = Velten_et_al._metaData
)
Ting_et_al._Data_obj <- CreateSeuratObject(
  counts = Ting_et_al._Data,
  project = "Ting_et_al._Data",
  meta.data = Ting_et_al._metaData
)
Yu_et_al._Data_obj <- CreateSeuratObject(
  counts = Yu_et_al._Data,
  project = "Yu_et_al._Data",
  meta.data = Yu_et_al._metaData
)
Sarioglu_et_al._Data_obj <- CreateSeuratObject(
  counts = Sarioglu_et_al._Data,
  project = "Sarioglu_et_al._Data",
  meta.data = Sarioglu_et_al._metaData
)
Jordan_et_al._Data_obj <- CreateSeuratObject(
  counts = Jordan_et_al._Data,
  project = "Jordan_et_al._Data",
  meta.data = Jordan_et_al._metaData
)
Aceto_et_al._Data_obj <- CreateSeuratObject(
  counts = Aceto_et_al._Data,
  project = "Aceto_et_al._Data",
  meta.data = Aceto_et_al._metaData
)
Zheng_et_al._Data_obj <- CreateSeuratObject(
  counts = Zheng_et_al._Data,
  project = "Zheng_et_al._Data",
  meta.data = Zheng_et_al._metaData
)

Seurat objects list

# Collect the per-study Seurat objects into a single list. The order given
# here is the order every later list operation sees.
seurat_objs <- list(
  Velten_et_al._Data_obj, Ting_et_al._Data_obj,
  Yu_et_al._Data_obj, Sarioglu_et_al._Data_obj,
  Jordan_et_al._Data_obj, Aceto_et_al._Data_obj,
  Zheng_et_al._Data_obj
)

# Normalize and identify variable features for each dataset independently.
seurat_objs.list <- lapply(X = seurat_objs, FUN = function(x) {
  # TRUE rather than T: `T` is an ordinary variable that can be reassigned.
  x <- NormalizeData(x, verbose = TRUE, normalization.method = "LogNormalize")
  # Return the processed object as the value of the function instead of
  # relying on the (invisible) value of a trailing assignment.
  FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# Select features that are repeatedly variable across datasets for integration.
features <- SelectIntegrationFeatures(object.list = seurat_objs.list)

# Scale each dataset on the shared integration features only.
seurat_objs.list <- lapply(X = seurat_objs.list, FUN = function(x) {
  ScaleData(x, features = features, verbose = FALSE)
})

Here we use CCA as the reduction method.

# Perform integration.

# Find anchors between the datasets. `anchor.features` is set to the
# `features` vector that was already used to scale each dataset, so anchor
# finding and scaling are guaranteed to use the same genes rather than relying
# on the function re-deriving them from its defaults. `reduction = "cca"`
# spells out the method this analysis is about instead of leaving it implicit.
Study1_CCA.anchors <- FindIntegrationAnchors(
  object.list = seurat_objs.list,
  anchor.features = features,
  reduction = "cca",
  k.filter = 200,
  k.anchor = 3,
  dims = 1:5,
  k.score = 5
)

# This command creates an 'integrated' data assay.
Study1_CCA.combined1 <- IntegrateData(
  anchorset = Study1_CCA.anchors,
  dims = 1:5,
  k.weight = 5
)

# Downstream analysis runs on the batch-corrected values; the original,
# unmodified data still resides in the 'RNA' assay.
DefaultAssay(Study1_CCA.combined1) <- "integrated"

# Standard workflow for visualization and clustering, written as one pipeline:
# scale -> PCA (50 components) -> UMAP -> neighbour graph -> clustering.
# NOTE(review): UMAP uses PCs 1:5 while the neighbour graph uses PCs 1:30 --
# confirm this difference is intentional.
Study1_CCA.combined1 <- Study1_CCA.combined1 %>%
  ScaleData(verbose = FALSE) %>%
  RunPCA(npcs = 50, verbose = TRUE) %>%
  RunUMAP(reduction = "pca", dims = 1:5) %>%
  FindNeighbors(reduction = "pca", dims = 1:30) %>%
  FindClusters(resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1184
## Number of edges: 51431
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8002
## Number of communities: 7
## Elapsed time: 0 seconds

Colorkey for Visualization

# Manual colour palette (31 named R colours) passed to scale_color_manual()
# in the plots below. Order matters: colours are assigned by position.
ColorKeyDataID <- c(
  "coral4", "darkcyan", "steelblue", "orangered",
  "darkolivegreen4", "lightsteelblue3", "darkorchid4", "darkslategray",
  "salmon3", "paleturquoise1", "mediumaquamarine", "greenyellow",
  "black", "deepskyblue3", "mediumblue", "peru",
  "gold", "gray50", "hotpink", "khaki3",
  "yellow4", "lavender", "cornsilk4", "orchid4",
  "yellow3", "darkgreen", "skyblue1", "khaki4",
  "tan4", "firebrick1", "pink"
)

PCA

# PCA embedding, cells coloured by the DataID metadata column.
DimPlot(
  Study1_CCA.combined1,
  reduction = "pca",
  pt.size = 1,
  group.by = "DataID"
) +
  scale_color_manual(values = ColorKeyDataID)

# PCA embedding with the default grouping (no group.by), labels switched off.
DimPlot(
  Study1_CCA.combined1,
  reduction = "pca",
  pt.size = 1,
  label = FALSE,
  repel = TRUE
) +
  scale_color_manual(values = ColorKeyDataID)

UMAP

# UMAP embedding, cells coloured by the DataID metadata column.
DimPlot(
  Study1_CCA.combined1,
  reduction = "umap",
  pt.size = 1,
  group.by = "DataID"
) +
  scale_color_manual(values = ColorKeyDataID)

# UMAP embedding with the default grouping (no group.by), labels switched off.
DimPlot(
  Study1_CCA.combined1,
  reduction = "umap",
  pt.size = 1,
  label = FALSE,
  repel = TRUE
) +
  scale_color_manual(values = ColorKeyDataID)

Create a data frame from the Seurat object

# Start from the UMAP cell embeddings, then attach three metadata columns.
seurat_umap_example <- data.frame(
  Study1_CCA.combined1@reductions$umap@cell.embeddings
)
seurat_umap_example[["Class"]] <- Study1_CCA.combined1@meta.data[["DataID"]]
seurat_umap_example[["Clusters"]] <-
  Study1_CCA.combined1@meta.data[["seurat_clusters"]]
seurat_umap_example[["Cell_type"]] <-
  Study1_CCA.combined1@meta.data[["Cell_type"]]

# Replace the embedding column names with plain UMAP1/UMAP2; the three
# metadata columns keep the names assigned above.
names(seurat_umap_example) <- c(
  "UMAP1", "UMAP2", "Class", "Clusters", "Cell_type"
)

barplot

 # Stacked bar chart of cell counts per cluster, filled by Cell_type.
 # Only the `fill` aesthetic is mapped, so the colour-only pieces of the
 # original call were dropped: an argument-less scale_color_manual() (which
 # fails on ggplot2 versions that require `values`) and a guides(colour = ...)
 # override that had no legend to act on.
 ggplot(seurat_umap_example, aes(x = Clusters, fill = Cell_type)) +
   theme_classic() +
   geom_bar(stat = "count") +
   scale_fill_manual(
     values = c("dodgerblue4", "firebrick3", "darkgreen", "dark turquoise")
   ) +
   theme(
     legend.text = element_text(size = 14),
     plot.title = element_text(size = 16),
     legend.title = element_text(size = 20),
     axis.text = element_text(size = 20),
     axis.title = element_text(size = 22, face = "bold")
   )

ARI, NMI and Cluster purity

# ARI between the Seurat cluster assignments and the Cell_type labels.
aricode::ARI(seurat_umap_example$Clusters,seurat_umap_example$Cell_type)
## [1] 0.01070219
# NMI between the same two labelings.
aricode::NMI(seurat_umap_example$Clusters,seurat_umap_example$Cell_type)
## [1] 0.01099027
# Cluster purity of a clustering against reference class labels.
#
# For every cluster, count the members of its most frequent class; purity is
# the sum of those counts divided by the number of items.
#
# clusters: vector of cluster assignments, one per item.
# classes:  vector of reference labels, same length and order as `clusters`.
# Returns a single number.
ClusterPurity <- function(clusters, classes) {
  # Rows are classes, columns are clusters.
  contingency <- table(classes, clusters)
  majority_counts <- apply(contingency, MARGIN = 2, FUN = max)
  sum(majority_counts) / length(clusters)
}
# Purity of the Seurat clusters measured against the Cell_type labels.
ClusterPurity(seurat_umap_example$Clusters,seurat_umap_example$Cell_type)
## [1] 0.8758446