Seurat pipeline for Study 1 data. We combined all Study 1 datasets into a single matrix on the basis of their common genes.

source("~/Supplemental_code/Data_integration.R")
library(Seurat)
library(ggplot2)

Load datasets for Study 1

# Study 1 inputs: one expression file and one metadata file per publication,
# listed in the order in which they are loaded.
study1_rdata_files <- c(
  "~/Supplemental_code/unCTC_datasets/Zheng_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Zheng_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Velten_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Velten_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Sarioglu_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Sarioglu_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Jordan_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Jordan_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Aceto_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Aceto_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Yu_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Yu_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Ting_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Ting_et_al._Data.RData"
)

# load() restores the saved objects into the current environment. Later steps
# refer to objects such as Zheng_et_al._Data and Zheng_et_al._metaData --
# presumably each file stores the object its file name suggests (confirm).
for (rdata_file in study1_rdata_files) {
  load(rdata_file)
}

Data integration based on common genes

# Merge the seven Study 1 expression objects into one matrix.
# Data_integration() is defined in the sourced Data_integration.R; per the
# description in this script it combines the datasets on their common genes.
Data2 <- Data_integration(
  data_list = list(
    Velten_et_al._Data,
    Ting_et_al._Data,
    Yu_et_al._Data,
    Sarioglu_et_al._Data,
    Jordan_et_al._Data,
    Aceto_et_al._Data,
    Zheng_et_al._Data
  )
)

# Stack the per-study metadata in the same study order as data_list above.
# NOTE(review): assumes the metadata row names correspond to the columns of
# Data2 -- confirm against Data_integration() and the metadata objects.
data2_metadata <- rbind(
  Velten_et_al._metaData,
  Ting_et_al._metaData,
  Yu_et_al._metaData,
  Sarioglu_et_al._metaData,
  Jordan_et_al._metaData,
  Aceto_et_al._metaData,
  Zheng_et_al._metaData
)

# Build the Seurat object from the combined matrix and its metadata.
Seurat_obj_study1 <- CreateSeuratObject(
  counts = Data2,
  project = "Study_1_data",
  meta.data = data2_metadata
)

Normalize the combined data and identify variable features

# Log-normalise the combined object with a scale factor of 10,000.
Seurat_obj_study1 <- NormalizeData(
  Seurat_obj_study1,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)

# Feature selection: flag 2,000 highly variable features with the "vst" method.
Seurat_obj_study1 <- FindVariableFeatures(
  Seurat_obj_study1,
  selection.method = "vst",
  nfeatures = 2000
)

# Scaling is applied to every gene in the object, not only the variable ones.
all.genes <- rownames(Seurat_obj_study1)
Seurat_obj_study1 <- ScaleData(
  Seurat_obj_study1,
  features = all.genes
)

# Linear dimensional reduction (PCA) restricted to the variable features.
Seurat_obj_study1 <- RunPCA(
  Seurat_obj_study1,
  features = VariableFeatures(object = Seurat_obj_study1)
)

# Clustering: neighbour graph on dimensions 1-10, then community detection
# at resolution 0.5.
Seurat_obj_study1 <- FindNeighbors(
  Seurat_obj_study1,
  dims = 1:10
)
Seurat_obj_study1 <- FindClusters(
  Seurat_obj_study1,
  resolution = 0.5
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1184
## Number of edges: 38745
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8262
## Number of communities: 7
## Elapsed time: 0 seconds
# UMAP embedding computed from the same dimensions (1-10) used for clustering.
# If UMAP is not installed yet, it can be installed with
# reticulate::py_install(packages = 'umap-learn').
Seurat_obj_study1 <- RunUMAP(
  Seurat_obj_study1,
  dims = 1:10
)

Color key for visualization

# Manual colour palette (31 named R colours) passed to scale_color_manual()
# in the PCA and UMAP plots below.
ColorKeyDataID <- c(
  "coral4", "darkcyan", "steelblue", "orangered", "darkolivegreen4",
  "lightsteelblue3", "darkorchid4", "darkslategray", "salmon3",
  "paleturquoise1", "mediumaquamarine", "greenyellow", "black",
  "deepskyblue3", "mediumblue", "peru", "gold", "gray50", "hotpink",
  "khaki3", "yellow4", "lavender", "cornsilk4", "orchid4", "yellow3",
  "darkgreen", "skyblue1", "khaki4", "tan4", "firebrick1", "pink"
)

PCA

# PCA embedding, points grouped by the DataID metadata column.
DimPlot(
  Seurat_obj_study1,
  reduction = "pca",
  pt.size = 1,
  group.by = "DataID"
) +
  scale_color_manual(values = ColorKeyDataID)

# Same embedding without group.by, so DimPlot uses its default grouping
# (presumably the cluster identities assigned by FindClusters -- confirm).
DimPlot(
  Seurat_obj_study1,
  reduction = "pca",
  pt.size = 1,
  label = FALSE,
  repel = TRUE
) +
  scale_color_manual(values = ColorKeyDataID)

UMAP

# UMAP embedding, points grouped by the DataID metadata column.
DimPlot(
  Seurat_obj_study1,
  reduction = "umap",
  pt.size = 1,
  group.by = "DataID"
) +
  scale_color_manual(values = ColorKeyDataID)

# Same embedding without group.by, so DimPlot uses its default grouping
# (presumably the cluster identities assigned by FindClusters -- confirm).
DimPlot(
  Seurat_obj_study1,
  reduction = "umap",
  pt.size = 1,
  label = FALSE,
  repel = TRUE
) +
  scale_color_manual(values = ColorKeyDataID)

Create a data frame for the bar plot

# Start from the UMAP cell embeddings stored on the Seurat object.
seurat_umap_study1 <- data.frame(
  Seurat_obj_study1@reductions$umap@cell.embeddings
)

# Attach per-cell annotations taken from the object's metadata:
# cluster assignment, source dataset and cell type.
seurat_umap_study1[["Clusters"]] <- Seurat_obj_study1@meta.data$seurat_clusters
seurat_umap_study1[["DataID"]] <- Seurat_obj_study1@meta.data$DataID
seurat_umap_study1[["Class"]] <- Seurat_obj_study1@meta.data$Cell_type

# Final column names; the two embedding columns become UMAP1 and UMAP2.
names(seurat_umap_study1) <- c("UMAP1", "UMAP2", "Clusters", "DataID", "Class")

Bar plot

# Stacked bar chart: number of cells in each cluster, with every bar split by
# Class via the fill aesthetic.
# The earlier version also added scale_color_manual() with no `values` and
# guides(colour = ...). Nothing in this plot maps the colour aesthetic (only
# fill is mapped), so both were no-ops and have been removed; the rendered
# figure is unchanged.
ggplot(seurat_umap_study1, aes(x = Clusters, fill = Class)) +
  theme_classic() +
  geom_bar(stat = "count") +
  # Four fill colours are supplied, so Class is expected to have at most four
  # distinct values.
  scale_fill_manual(
    values = c("dodgerblue4", "firebrick3", "darkgreen", "dark turquoise")
  ) +
  theme(
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 16),
    legend.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 22, face = "bold")
  )

ARI, NMI and Cluster purity

# Adjusted Rand index between the cluster assignments and the Class labels.
aricode::ARI(
  seurat_umap_study1$Clusters,
  seurat_umap_study1$Class
)
## [1] 0.1069208
# Normalised mutual information between the same two labelings.
aricode::NMI(
  seurat_umap_study1$Clusters,
  seurat_umap_study1$Class
)
## [1] 0.1859378
# Cluster purity: for every cluster take the count of its most frequent class,
# add those counts up and divide by the number of observations.
#   clusters: vector of cluster assignments, one entry per observation.
#   classes:  vector of reference labels, same length as `clusters`.
# Returns a single numeric value.
ClusterPurity <- function(clusters, classes) {
  # Rows are classes, columns are clusters.
  class_by_cluster <- table(classes, clusters)
  majority_counts <- apply(class_by_cluster, 2, max)
  sum(majority_counts) / length(clusters)
}
# Purity of the clusters with respect to the Class labels.
ClusterPurity(
  seurat_umap_study1$Clusters,
  seurat_umap_study1$Class
)
## [1] 0.9932432