Here, Harmony is executed on the Seurat object for Study 2 (TPM) data. The parameters and commands are derived from the Harmony documentation.

source("~/Supplemental_code/Data_integration.R")
library(Seurat)
library(dplyr)
library(harmony)
library(ggplot2)

Load datasets

#Load the TPM data and metadata .RData files of each dataset (original order kept)
study2_rdata_files <- c(
  "~/Supplemental_code/unCTC_datasets/Poonia_et_al._TPMData.RData",
  "~/Supplemental_code/unCTC_datasets/Poonia_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_TPMData.RData",
  "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_TPMData.RData",
  "~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Ebright_et_al._TPMData.RData",
  "~/Supplemental_code/unCTC_datasets/Ebright_et_al._metaData.RData"
)
#A top-level for loop does not open a new scope, so load() restores the
#objects into the same environment as the individual calls did
for (rdata_file in study2_rdata_files) {
  load(rdata_file)
}

Data integration based on common genes

#Expression inputs handed to Data_integration() (helper sourced from Data_integration.R)
study2_expression_sets <- list(
  Poonia_et_al._TPMData,
  Ding_et_al._WBC1_TPMData,
  Ebright_et_al._TPMData,
  Ding_et_al._WBC2_TPMData
)
Stydy2Data <- Data_integration(data_list = study2_expression_sets)

#Row-bind the metadata in the same dataset order as the expression list above
Stydy2Datametadata <- rbind(
  Poonia_et_al._metaData,
  Ding_et_al._WBC1_metaData,
  Ebright_et_al._metaData,
  Ding_et_al._WBC2_metaData
)

The normalization step is skipped because the input is already length-normalized (TPM); the values are log2-transformed when the Seurat object is created.

#Build the Seurat object from log2(TPM + 1) values with the combined metadata,
#then select 2000 variable features (vst), scale the data and compute 20 PCs.
#RunPCA() is called without an explicit feature list: the earlier
#`pc.genes = pbmc@var.genes` argument referred to an object (`pbmc`) that is
#not created anywhere in the visible script and to a legacy argument name.
#With `features` left at its default, RunPCA() uses the variable features
#selected by FindVariableFeatures().
harmony_obj_study2 <- CreateSeuratObject(
  counts = log2(Stydy2Data + 1),
  project = "Harmony_seurat2",
  min.cells = 5,
  meta.data = Stydy2Datametadata
) %>%
  FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>%
  ScaleData(verbose = FALSE) %>%
  RunPCA(npcs = 20, verbose = FALSE)

Run Harmony using the RunHarmony Seurat wrapper

#Run Harmony on the Seurat object, grouping cells by the "Class" metadata column
harmony_obj_study2 <- harmony::RunHarmony(
  harmony_obj_study2,
  group.by.vars = "Class"
)

#Extract the "harmony" reduction embeddings and show the first 5 rows and columns
harmony_embeddings <- Embeddings(harmony_obj_study2, reduction = "harmony")
harmony_embeddings[seq_len(5), seq_len(5)]
##                     harmony_1 harmony_2  harmony_3 harmony_4 harmony_5
## 1851009049_CS5_S22   4.226928  8.145651 -10.689047 -19.95081 -14.00140
## 1851013039_CS34_S53  3.454174  6.820961  -9.595841 -18.90189 -13.47019
## 1851009049_CS14_S17  3.589440  7.510394  -9.520951 -18.82312 -13.05215
## 1851009049_CS9_S25   4.589478  7.783434 -10.098837 -19.13701 -13.86454
## 1851009049_CS4_S21   4.076038  7.588703 -10.189828 -18.86926 -12.84211
#UMAP, neighbor graph and clustering, all computed on the first 20 Harmony dimensions
harmony_obj_study2 <- RunUMAP(
  harmony_obj_study2,
  reduction = "harmony",
  dims = 1:20
)
harmony_obj_study2 <- FindNeighbors(
  harmony_obj_study2,
  reduction = "harmony",
  dims = 1:20
)
harmony_obj_study2 <- FindClusters(harmony_obj_study2, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1648
## Number of edges: 51526
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9210
## Number of communities: 12
## Elapsed time: 0 seconds

Visualization color key

#Palette of 30 named R colors passed to scale_color_manual() in the plots below
ColorKeyDataID <- c(
  "peru", "steelblue", "darkolivegreen4", "palevioletred4", "darkcyan",
  "darkorchid4", "darkslategray", "firebrick1", "salmon3", "paleturquoise1",
  "mediumaquamarine", "greenyellow", "black", "deepskyblue3", "mediumblue",
  "darkred", "gold", "gray50", "hotpink", "khaki3",
  "yellow4", "lavender", "cornsilk4", "orchid4", "yellow3",
  "darkgreen", "skyblue1", "khaki4", "tan4", "pink"
)

PCA plot

#PCA embedding colored by the "Class" metadata column
DimPlot(
  object = harmony_obj_study2,
  reduction = "pca",
  pt.size = 1,
  group.by = "Class"
) +
  scale_color_manual(values = ColorKeyDataID)

#PCA embedding with DimPlot's default grouping (no group.by supplied)
DimPlot(
  harmony_obj_study2,
  reduction = "pca",
  pt.size = 1
) +
  scale_color_manual(values = ColorKeyDataID)

Harmony plot

#Harmony embedding colored by the "Class" metadata column
DimPlot(
  object = harmony_obj_study2,
  reduction = "harmony",
  pt.size = 1,
  group.by = "Class"
) +
  scale_color_manual(values = ColorKeyDataID)

#Harmony embedding with DimPlot's default grouping (no group.by supplied)
DimPlot(
  object = harmony_obj_study2,
  reduction = "harmony",
  pt.size = 1
) +
  scale_color_manual(values = ColorKeyDataID)

UMAP plot

#UMAP embedding colored by the "Class" metadata column
DimPlot(
  harmony_obj_study2,
  reduction = "umap",
  group.by = "Class",
  pt.size = 1
) +
  scale_color_manual(values = ColorKeyDataID)

#UMAP embedding with DimPlot's default grouping, without cluster labels
DimPlot(
  harmony_obj_study2,
  reduction = "umap",
  label = FALSE,
  pt.size = 1
) +
  scale_color_manual(values = ColorKeyDataID)

Bar plot of cell-type composition per cluster

#Per-cell metadata of the Seurat object; the plot and the metrics further
#down read its seurat_clusters and Cell_type columns
harmony_df_study2 <- harmony_obj_study2@meta.data

#Bar chart of cell counts per Seurat cluster, filled by Cell_type.
#Only `x` and `fill` are mapped here, so the colour-only layers of the
#earlier version were dropped: `scale_color_manual()` was called without
#`values` (a missing-argument error on older ggplot2 releases, a no-op
#otherwise) and `guides(colour = ...)` addressed a legend this plot never
#draws. The rendered figure is unchanged.
ggplot(harmony_df_study2, aes(x = seurat_clusters, fill = Cell_type)) +
  theme_classic() +
  geom_bar(stat = "count") +
  scale_fill_manual(
    values = c("dodgerblue4", "firebrick3", "darkgreen", "dark turquoise")
  ) +
  theme(
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 16),
    legend.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 22, face = "bold")
  )

ARI, NMI and Cluster purity

#Adjusted Rand index between the Seurat cluster labels and the Cell_type labels
aricode::ARI(
  harmony_df_study2$seurat_clusters,
  harmony_df_study2$Cell_type
)
## [1] 0.170961
#Normalized mutual information between the same two labelings
aricode::NMI(
  harmony_df_study2$seurat_clusters,
  harmony_df_study2$Cell_type
)
## [1] 0.2151725
#Cluster purity: cross-tabulates classes (rows) against clusters (columns),
#takes the largest class count within each cluster, and divides the sum of
#those counts by the number of cluster labels supplied.
#  clusters: vector of cluster assignments
#  classes:  vector of reference labels, same length as `clusters`
#Returns a single numeric value.
ClusterPurity <- function(clusters, classes) {
  contingency <- table(classes, clusters)
  dominant_counts <- apply(contingency, 2, max)
  sum(dominant_counts) / length(clusters)
}
#Purity of the Seurat clusters with respect to the Cell_type labels
ClusterPurity(
  clusters = harmony_df_study2$seurat_clusters,
  classes = harmony_df_study2$Cell_type
)
## [1] 0.9296117