Seurat pipeline for Study 2 TPM data. We combined all datasets of
Study 2 into a single matrix on the basis of common genes.
source("~/Supplemental_code/Data_integration.R")
library(Seurat)
library(ggplot2)
Load datasets for Study 2
# Restore the Study 2 expression (TPM) and metadata objects from disk.
# The files are read in the order listed, and load() places their contents in
# the current environment. Presumably each file provides the object named
# after it (those names are used further down) -- confirm against the files.
invisible(lapply(
  c(
    "~/Supplemental_code/unCTC_datasets/Poonia_et_al._TPMData.RData",
    "~/Supplemental_code/unCTC_datasets/Poonia_et_al._metaData.RData",
    "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_metaData.RData",
    "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_TPMData.RData",
    "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_TPMData.RData",
    "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_metaData.RData",
    "~/Supplemental_code/unCTC_datasets/Ebright_et_al._TPMData.RData",
    "~/Supplemental_code/unCTC_datasets/Ebright_et_al._metaData.RData"
  ),
  load,
  envir = environment()
))
Data integration based on common genes
# Merge the four TPM matrices with Data_integration() (sourced from
# Data_integration.R; per the accompanying text it keeps the common genes --
# confirm in that file).
Stydy2Data <- Data_integration(
  data_list = list(
    Poonia_et_al._TPMData,
    Ding_et_al._WBC1_TPMData,
    Ebright_et_al._TPMData,
    Ding_et_al._WBC2_TPMData
  )
)

# Stack the metadata tables in the same dataset order as the expression list.
Stydy2Datametadata <- rbind(
  Poonia_et_al._metaData,
  Ding_et_al._WBC1_metaData,
  Ebright_et_al._metaData,
  Ding_et_al._WBC2_metaData
)

# The Seurat object is built from log2(TPM + 1) values, passed as `counts`.
Seurat_obj_study2 <- CreateSeuratObject(
  counts = log2(Stydy2Data + 1),
  project = "Study2_Data",
  meta.data = Stydy2Datametadata
)
# NormalizeData() is deliberately not run: the input is already
# log-transformed, length-normalized TPM data.
# Seurat_obj_study2 <- NormalizeData(Seurat_obj_study2, normalization.method = "LogNormalize", scale.factor = 10000)

# Feature selection: keep the 2000 most variable features ("vst" method).
Seurat_obj_study2 <- FindVariableFeatures(
  Seurat_obj_study2,
  selection.method = "vst",
  nfeatures = 2000
)

# Scale every gene in the object, not only the variable features.
all.genes <- rownames(Seurat_obj_study2)
Seurat_obj_study2 <- ScaleData(
  Seurat_obj_study2,
  features = all.genes
)

# Linear dimensional reduction (PCA) on the variable features selected above.
Seurat_obj_study2 <- RunPCA(
  Seurat_obj_study2,
  features = VariableFeatures(object = Seurat_obj_study2)
)

# Clustering: neighbour graph on dims 1:10, then clusters at resolution 0.5.
Seurat_obj_study2 <- FindNeighbors(
  Seurat_obj_study2,
  dims = 1:10
)
Seurat_obj_study2 <- FindClusters(
  Seurat_obj_study2,
  resolution = 0.5
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1648
## Number of edges: 43454
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9277
## Number of communities: 14
## Elapsed time: 0 seconds
# UMAP embedding computed from the same dims (1:10) used for clustering.
# If UMAP is not installed yet, it can be installed with
# reticulate::py_install(packages = 'umap-learn').
Seurat_obj_study2 <- RunUMAP(
  Seurat_obj_study2,
  dims = 1:10
)
Color key for visualization
# Manual colour palette (30 named R colours) passed to scale_color_manual()
# in the embedding plots.
ColorKeyDataID <- c(
  "peru", "steelblue", "darkolivegreen4",
  "palevioletred4", "darkcyan", "darkorchid4",
  "darkslategray", "firebrick1", "salmon3",
  "paleturquoise1", "mediumaquamarine", "greenyellow",
  "black", "deepskyblue3", "mediumblue",
  "darkred", "gold", "gray50",
  "hotpink", "khaki3", "yellow4",
  "lavender", "cornsilk4", "orchid4",
  "yellow3", "darkgreen", "skyblue1",
  "khaki4", "tan4", "pink"
)
PCA
# Tip: set `label = TRUE`, or use the LabelClusters function, to label
# individual clusters on these plots.

# PCA embedding, points grouped by the "Class" metadata column.
DimPlot(
  Seurat_obj_study2,
  reduction = "pca",
  pt.size = 1,
  group.by = "Class"
) +
  scale_color_manual(values = ColorKeyDataID)

# PCA embedding with DimPlot's default grouping (presumably the clusters
# assigned by FindClusters -- confirm).
DimPlot(
  Seurat_obj_study2,
  reduction = "pca",
  pt.size = 1,
  label = FALSE,
  repel = TRUE
) +
  scale_color_manual(values = ColorKeyDataID)

UMAP
# UMAP embedding, points grouped by the "Class" metadata column.
DimPlot(
  Seurat_obj_study2,
  reduction = "umap",
  pt.size = 1,
  group.by = "Class"
) +
  scale_color_manual(values = ColorKeyDataID)

# UMAP embedding with DimPlot's default grouping (presumably the clusters
# assigned by FindClusters -- confirm).
DimPlot(
  Seurat_obj_study2,
  reduction = "umap",
  pt.size = 1,
  label = FALSE,
  repel = TRUE
) +
  scale_color_manual(values = ColorKeyDataID)

Create a data frame for barplot
# Per-cell table for the bar plot and the clustering metrics: the UMAP
# coordinates plus cluster, class and cell-type labels from the metadata.
# stringsAsFactors = FALSE keeps character labels as characters, as plain
# column assignment would.
seurat_umap_study2 <- data.frame(
  Seurat_obj_study2@reductions$umap@cell.embeddings,
  Clusters = Seurat_obj_study2@meta.data$seurat_clusters,
  Class = Seurat_obj_study2@meta.data$Class,
  Cell_type = Seurat_obj_study2@meta.data$Cell_type,
  stringsAsFactors = FALSE
)
# Give the embedding columns fixed names, whatever the reduction called them.
colnames(seurat_umap_study2) <- c(
  "UMAP1", "UMAP2", "Clusters", "Class", "Cell_type"
)
barplot
# Bar chart of the number of cells per Seurat cluster, filled by Cell_type.
# Only `fill` is mapped in this plot, so the previous empty
# scale_color_manual() and guides(colour = ...) layers had no colour scale or
# legend to act on; they are removed without changing the rendered figure.
# NOTE(review): four fill colours are supplied -- confirm Cell_type has at
# most four distinct values.
ggplot(seurat_umap_study2, aes(x = Clusters, fill = Cell_type)) +
  theme_classic() +
  geom_bar(stat = "count") +
  scale_fill_manual(
    values = c("dodgerblue4", "firebrick3", "darkgreen", "dark turquoise")
  ) +
  theme(
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 16),
    legend.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 22, face = "bold")
  )

ARI, NMI and Cluster purity
# Adjusted Rand index between the Seurat clusters and the Cell_type labels.
aricode::ARI(
  seurat_umap_study2$Clusters,
  seurat_umap_study2$Cell_type
)
## [1] 0.1884311
# Normalized mutual information for the same two labelings.
aricode::NMI(
  seurat_umap_study2$Clusters,
  seurat_umap_study2$Cell_type
)
## [1] 0.2490442
# Cluster purity: the fraction of observations whose class is the most
# frequent class within their assigned cluster.
#
#   clusters  Vector of cluster assignments, one entry per observation.
#   classes   Vector of reference labels, same length as `clusters`.
#
# Returns a single numeric value; NaN when there are no complete
# (cluster, class) pairs to tabulate.
ClusterPurity <- function(clusters, classes) {
  stopifnot(length(clusters) == length(classes))
  # Rows are classes, columns are clusters. table() leaves out pairs in which
  # either label is NA.
  contingency <- table(classes, clusters)
  # Divide by the number of tabulated pairs rather than length(clusters), so
  # observations dropped for missing labels do not deflate the score.
  sum(apply(contingency, 2, max)) / sum(contingency)
}
# Purity of the Seurat clusters with respect to the Cell_type labels.
ClusterPurity(
  clusters = seurat_umap_study2$Clusters,
  classes = seurat_umap_study2$Cell_type
)
## [1] 0.9793689