Here, Harmony is executed on the Seurat object for Study 1 (Count) data. The parameters and commands are derived from the Harmony documentation.

source("~/Supplemental_code/Data_integration.R")

Load libraries

library(Seurat)
library(dplyr)
library(harmony)
library(ggplot2)

Load datasets for Study 1

# Restore the saved Study 1 objects. The *_Data and *_metaData objects used
# further down are expected to come from these files. Files are loaded in the
# order listed, into the environment this code is evaluated in.
study1_rdata_files <- c(
  "~/Supplemental_code/unCTC_datasets/Zheng_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Zheng_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Velten_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Velten_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Sarioglu_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Sarioglu_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Jordan_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Jordan_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Aceto_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Aceto_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Yu_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Yu_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Ting_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Ting_et_al._Data.RData"
)
for (rdata_file in study1_rdata_files) {
  load(rdata_file)
}

Data integration based on common genes

# Combine the seven Study 1 datasets with Data_integration() (presumably
# provided by the sourced Data_integration.R) and stack their metadata tables
# in the same dataset order.
Data2 <- Data_integration(
  data_list = list(
    Velten_et_al._Data,
    Ting_et_al._Data,
    Yu_et_al._Data,
    Sarioglu_et_al._Data,
    Jordan_et_al._Data,
    Aceto_et_al._Data,
    Zheng_et_al._Data
  )
)
data2_metadata <- rbind(
  Velten_et_al._metaData,
  Ting_et_al._metaData,
  Yu_et_al._metaData,
  Sarioglu_et_al._metaData,
  Jordan_et_al._metaData,
  Aceto_et_al._metaData,
  Zheng_et_al._metaData
)

# Build the Seurat object (features seen in fewer than 5 cells are dropped via
# min.cells), then normalise, pick 2000 variable features, scale and run PCA.
#
# RunPCA() is called without a feature argument on purpose: it defaults to the
# variable features chosen by FindVariableFeatures(). The previous
# `pc.genes = harmony_obj_study1@var.genes` was the Seurat v2 spelling and
# referred to `harmony_obj_study1` inside the very expression that first
# assigns it.
harmony_obj_study1 <- CreateSeuratObject(
  counts = Data2,
  project = "Harmony_seurat",
  min.cells = 5,
  meta.data = data2_metadata
) %>%
  Seurat::NormalizeData(verbose = FALSE) %>%
  FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>%
  ScaleData(verbose = FALSE) %>%
  RunPCA(npcs = 20, verbose = FALSE)

Run the RunHarmony Seurat wrapper

# Run Harmony on the Seurat object, grouping cells by the "DataID" metadata
# column.
harmony_obj_study1 <- harmony::RunHarmony(
  harmony_obj_study1,
  group.by.vars = "DataID"
)

# Pull out the "harmony" reduction and print its top-left 5 x 5 corner.
harmony_embeddings <- Embeddings(harmony_obj_study1, reduction = "harmony")
harmony_embeddings[seq_len(5), seq_len(5)]
##                 harmony_1   harmony_2  harmony_3 harmony_4   harmony_5
## I1_plate10_A_10  6.754911 -19.1408528 -1.3560431  1.432254 -0.02593458
## I1_plate10_A_11  3.816022 -16.3858334  0.8536223 -2.559157  3.53954807
## I1_plate10_A_12 -2.653742  -2.0642228  6.4434103 -1.120137  1.21032208
## I1_plate10_A_1  -2.250052   0.8471155  1.0422958 -2.755394  1.07739774
## I1_plate10_A_4  35.390286  13.4847486 -5.5926939  0.268812  7.59681746
# Downstream steps all operate on the first 20 dimensions of the "harmony"
# reduction: UMAP embedding, neighbour graph, then clustering at
# resolution 0.5.
harmony_obj_study1 <- RunUMAP(
  harmony_obj_study1,
  reduction = "harmony",
  dims = 1:20
)
harmony_obj_study1 <- FindNeighbors(
  harmony_obj_study1,
  reduction = "harmony",
  dims = 1:20
)
harmony_obj_study1 <- FindClusters(harmony_obj_study1, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1184
## Number of edges: 41407
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9090
## Number of communities: 13
## Elapsed time: 0 seconds

Color key for visualization

# Manual colour palette (31 named R colours) passed to scale_color_manual()
# in the plots that follow. Order matters: colours are assigned in sequence.
ColorKeyDataID <- c(
  "coral4", "darkcyan", "steelblue", "orangered",
  "darkolivegreen4", "lightsteelblue3", "darkorchid4", "darkslategray",
  "salmon3", "paleturquoise1", "mediumaquamarine", "greenyellow",
  "black", "deepskyblue3", "mediumblue", "peru",
  "gold", "gray50", "hotpink", "khaki3",
  "yellow4", "lavender", "cornsilk4", "orchid4",
  "yellow3", "darkgreen", "skyblue1", "khaki4",
  "tan4", "firebrick1", "pink"
)

PCA plot

# PCA reduction, points grouped by the "DataID" metadata column.
DimPlot(
  object = harmony_obj_study1,
  reduction = "pca",
  group.by = "DataID",
  pt.size = 1
) +
  scale_color_manual(values = ColorKeyDataID)

# PCA reduction with DimPlot's default grouping.
DimPlot(
  object = harmony_obj_study1,
  reduction = "pca",
  pt.size = 1
) +
  scale_color_manual(values = ColorKeyDataID)

Harmony plot

# Harmony reduction, points grouped by the "DataID" metadata column.
DimPlot(
  object = harmony_obj_study1,
  reduction = "harmony",
  group.by = "DataID",
  pt.size = 1
) +
  scale_color_manual(values = ColorKeyDataID)

# Harmony reduction with DimPlot's default grouping.
DimPlot(
  object = harmony_obj_study1,
  reduction = "harmony",
  pt.size = 1
) +
  scale_color_manual(values = ColorKeyDataID)

UMAP plot

# UMAP reduction, points grouped by the "DataID" metadata column.
DimPlot(
  harmony_obj_study1,
  reduction = "umap",
  group.by = "DataID",
  pt.size = 1
) +
  scale_color_manual(values = ColorKeyDataID)

# UMAP reduction with DimPlot's default grouping.
DimPlot(
  harmony_obj_study1,
  reduction = "umap",
  pt.size = 1
) +
  scale_color_manual(values = ColorKeyDataID)

Bar plot of cell types per cluster

# Per-cell metadata of the Seurat object; the plot and the metrics further
# down read its seurat_clusters and Cell_type columns.
harmony_df_study1 <- harmony_obj_study1@meta.data

# Stacked bar chart: one bar per Seurat cluster, filled by Cell_type counts.
# The former `scale_color_manual()` call was removed: it supplied no `values`
# and no colour aesthetic is mapped in this plot, so it had no effect here.
ggplot(harmony_df_study1, aes(x = seurat_clusters, fill = Cell_type)) +
  theme_classic() +
  geom_bar(stat = "count") +
  # Four fill colours are supplied, one per Cell_type level.
  scale_fill_manual(
    values = c("dodgerblue4", "firebrick3", "darkgreen", "dark turquoise")
  ) +
  theme(
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 16),
    legend.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 22, face = "bold")
  ) +
  # NOTE(review): this targets the colour legend, but only `fill` is mapped,
  # so it appears to have no visible effect -- confirm whether `fill` was meant.
  guides(colour = guide_legend(override.aes = list(size = 6)))

ARI, NMI and Cluster purity

# Adjusted Rand index between Seurat cluster labels and Cell_type labels.
with(harmony_df_study1, aricode::ARI(seurat_clusters, Cell_type))
## [1] 0.07531007
# Normalised mutual information between the same two labelings.
with(harmony_df_study1, aricode::NMI(seurat_clusters, Cell_type))
## [1] 0.05466334
# Cluster purity: the fraction of observations that belong to the most common
# class of their own cluster.
#
# clusters: vector of cluster labels, one per observation.
# classes:  vector of reference class labels, same length as `clusters`
#           (table() raises an error if the lengths differ).
#
# Returns a single number in [0, 1]. Observations with an NA cluster or class
# label are left out of both the numerator and the denominator: table() drops
# them from the counts, so dividing by length(clusters) would understate
# purity. Returns NaN when no labelled observation remains.
ClusterPurity <- function(clusters, classes) {
  # Rows are classes, columns are clusters.
  contingency <- table(classes, clusters)
  n_labelled <- sum(contingency)
  if (n_labelled == 0L) {
    return(NaN)
  }
  # Largest class count within each cluster, summed over clusters.
  sum(apply(contingency, 2, max)) / n_labelled
}
# Purity of the Seurat clusters with respect to the Cell_type labels.
ClusterPurity(
  clusters = harmony_df_study1$seurat_clusters,
  classes = harmony_df_study1$Cell_type
)
## [1] 0.8758446