library(Linnorm)
library(SingleCellExperiment)
library(GSVA)
library(ggplot2)
library(factoextra)
library(NbClust)
library(reticulate)
library(PCAtools)
library(umap)
library(dplyr)
library(ggpubr)

# Source all the helper scripts.

# Source every project script, in the listed order. source() evaluates each
# file in the global environment by default, so whatever the scripts define
# becomes available to the rest of this analysis.
invisible(lapply(
  c(
    "CreateSingleCellObject.R",
    "DDLK_Clust.R",
    "Gene_Violin_plots.R",
    "PathwayEnrichmentScore.R",
    "Stouffer_score.R",
    "unCTC_pathway_plots.R",
    "unCTC_libraries.R"
  ),
  source
))

# Load the datasets for Study 1.

# Load the expression data and metadata .RData files of the seven studies.
# The loop runs at top level so that load() restores the saved objects into
# the same environment as a sequence of plain load() calls would.
for (rdata_file in c(
  "~/Supplemental_code/unCTC_datasets/Zheng_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Zheng_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Velten_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Velten_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Sarioglu_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Sarioglu_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Jordan_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Jordan_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Aceto_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Aceto_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Yu_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Yu_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Ting_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Ting_et_al._Data.RData"
)) {
  load(rdata_file)
}

# Build the lists of expression data, dataset IDs, and metadata.

# The three lists below enumerate the seven studies in the same order
# (Velten, Ting, Yu, Sarioglu, Jordan, Aceto, Zheng).
# NOTE(review): downstream functions presumably pair the entries by
# position - confirm before reordering any of them.

# Expression data objects, one per study
dataList <- list(
  Velten_et_al._Data,
  Ting_et_al._Data,
  Yu_et_al._Data,
  Sarioglu_et_al._Data,
  Jordan_et_al._Data,
  Aceto_et_al._Data,
  Zheng_et_al._Data
)

# Identifier string for each entry of dataList
dataId <- list(
  "Velten_et_al._Data",
  "Ting_et_al._Data",
  "Yu_et_al._Data",
  "Sarioglu_et_al._Data",
  "Jordan_et_al._Data",
  "Aceto_et_al._Data",
  "Zheng_et_al._Data"
)

# Metadata objects, one per study
MetaData <- list(
  Velten_et_al._metaData,
  Ting_et_al._metaData,
  Yu_et_al._metaData,
  Sarioglu_et_al._metaData,
  Jordan_et_al._metaData,
  Aceto_et_al._metaData,
  Zheng_et_al._metaData
)

# Load the gene sets.

# Restore the c2.all.v7.2.symbols object from its .RData file and make it
# available under the shorter name `genesets`.
load("~/Supplemental_code/unCTC_datasets/c2.all.v7.2.symbols.RData")
genesets <- c2.all.v7.2.symbols

# Call PathwayEnrichmentScore() to calculate the pathway enrichment scores.

# Compute the pathway scores for all datasets. The call is wrapped in
# capture.output() so that anything it prints to standard output is captured,
# and in invisible() so that the captured text is not echoed either. The
# assignment still takes effect in the calling environment.
invisible(capture.output({
  Pathway_score <- PathwayEnrichmentScore(
    data_list = dataList,
    data_id = dataId,
    Genesets = genesets,
    min.size = 10,
    max.size = 500,
    min_Sample = 10,
    min_Gene = 1000,
    Parallel_threads = 8L
  )
}))

# Call DDLK_Clust(). Python 3 must be the default Python interpreter for this call to work.

library(reticulate)
# To pin a specific Python installation, uncomment and adapt the use_python()
# line below; reticulate::py_config() shows which interpreter is active.
# If the active version differs from the given path, restart the R session
# and set the path again.
# use_python("/home/saritap/.local/lib/python3.7/site-packages", required = TRUE)
# reticulate::py_config()

# Cluster the pathway scores with n = 3; output is written relative to the
# current working directory.
DDLK_Clusters3 <- DDLK_Clust(
  PathwayScore = Pathway_score$Pathway_score,
  PathwayMetaData = Pathway_score$Pathway_metadata,
  n = 3,
  out.dir = getwd(),
  MetaData = MetaData
)

# PCA plots

# Run PCA on the pathway scores, passing the per-sample cluster table as
# metadata (removeVar = 0.1).
p1 <- pca(
  DDLK_Clusters3$Pathway_score,
  metadata = DDLK_Clusters3$PathwayDDLK_clust,
  removeVar = 0.1
)

# Plotting table: the first two principal components next to every column of
# the cluster table, with the cluster table's row names.
gsva_pca_df <- data.frame(
  PC1 = p1$rotated$PC1,
  PC2 = p1$rotated$PC2,
  DDLK_Clusters3$PathwayDDLK_clust
)
rownames(gsva_pca_df) <- rownames(DDLK_Clusters3$PathwayDDLK_clust)

# Colour palette for the manual colour scale of the scatter plots. It is used
# both when points are coloured by DataID and when they are coloured by
# Clusters.
ColorKeyDataID <- c(
  "coral4", "darkcyan", "steelblue", "orangered",
  "darkolivegreen4", "lightsteelblue3",
  "darkorchid4", "darkslategray", "salmon3",
  "paleturquoise1", "mediumaquamarine",
  "greenyellow", "black", "deepskyblue3", "mediumblue",
  "peru", "gold", "gray50", "hotpink", "khaki3",
  "yellow4", "lavender", "cornsilk4", "orchid4",
  "yellow3", "darkgreen", "skyblue1", "khaki4",
  "tan4", "firebrick1", "pink"
)

# Theme, legend, axis and colour-scale layers common to both PCA scatter
# plots. Adding a list to a ggplot adds its elements in order.
pca_scatter_style <- list(
  theme_classic(),
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    legend.position = "right",
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 14)
  ),
  guides(colour = guide_legend(override.aes = list(size = 6))),
  scale_x_continuous(minor_breaks = seq(-10, 10, 5)),
  scale_y_continuous(minor_breaks = seq(-10, 10, 5)),
  scale_colour_manual(values = ColorKeyDataID)
)

# PC1 vs PC2, coloured by dataset of origin
ggplot(gsva_pca_df, aes(x = PC1, y = PC2, colour = DataID)) +
  geom_point(size = 6, stroke = 0.2, shape = 16) +
  pca_scatter_style

# PC1 vs PC2, coloured by assigned cluster
ggplot(gsva_pca_df, aes(x = PC1, y = PC2, colour = Clusters)) +
  geom_point(size = 6, stroke = 0.2, shape = 20) +
  pca_scatter_style

# UMAP plots

Pathway.score <- DDLK_Clusters3$Pathway_score
Pathway.metadata <- DDLK_Clusters3$PathwayDDLK_clust
# Put the score columns in the same order as the metadata rows.
Pathway.score <- Pathway.score[, rownames(Pathway.metadata)]

# Fix the RNG seed, then embed the transposed score matrix in two dimensions.
set.seed(100)
umap_path <- umap(
  t(Pathway.score),
  n_neighbors = 50,
  n_components = 2,
  metric = "euclidean",
  n_epochs = 100,
  min_dist = 0.1,
  init = "spectral",
  random_state = 123,
  alpha = 2.5,
  gamma = 2.5,
  negative_sample_rate = 5
)
# Alternative parameterisation, kept for reference:
# umap_path = umap(t((Pathway_score)),n_neighbors= 15,
#                  n_components= 3,metric= "euclidean",n_epochs= 10,
#                  min_dist = 0.01,init ="spectral",random_state=1234,alpha=2,gamma=2,
#                  negative_sample_rate=5)

# Plotting table: both embedding coordinates next to the cluster metadata.
umap_df <- data.frame(
  UMAP1 = umap_path$layout[, 1],
  UMAP2 = umap_path$layout[, 2],
  Pathway.metadata
)

# Theme, legend, axis and colour-scale layers common to both UMAP scatter
# plots. Adding a list to a ggplot adds its elements in order.
umap_scatter_style <- list(
  theme_classic(),
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    legend.position = "right",
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 14)
  ),
  guides(colour = guide_legend(override.aes = list(size = 6))),
  scale_x_continuous(minor_breaks = seq(-10, 10, 5)),
  scale_y_continuous(minor_breaks = seq(-10, 10, 5)),
  scale_colour_manual(values = ColorKeyDataID)
)

# UMAP1 vs UMAP2, coloured by dataset of origin
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, colour = DataID)) +
  geom_point(size = 6, stroke = 0.2, shape = 16) +
  umap_scatter_style

# UMAP1 vs UMAP2, coloured by assigned cluster
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, colour = Clusters)) +
  geom_point(size = 6, stroke = 0.2, shape = 20) +
  umap_scatter_style

# Stacked bar chart: number of samples in each cluster, split by cell type.
# Only the `fill` aesthetic is mapped, so the value-less scale_color_manual()
# and the colour-legend override that used to be here had no effect on the
# plot and have been dropped.
ggplot(
  DDLK_Clusters3$PathwayDDLK_clust,
  aes(x = Clusters, fill = Cell_type)
) +
  theme_classic() +
  geom_bar(stat = "count") +
  scale_fill_manual(
    values = c("dodgerblue4", "firebrick3", "darkgreen", "dark turquoise")
  ) +
  theme(
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 16),
    legend.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 22, face = "bold")
  )

# Stouffer's score

# Read the tab-delimited PanglaoDB marker table.
marker_genes <- read.delim(
  "~/Supplemental_code/unCTC_datasets/PanglaoDB_markers_27_Mar_2020.tsv"
)

# Stouffer's score for epithelial marker genes

# Rows of the marker table annotated as epithelial cells.
Epi_marker_symbol <- marker_genes[marker_genes$cell.type == "Epithelial cells", ]

# Calculate Stouffer's score for the epithelial genes, grouped by cluster.
# The gene symbols are de-duplicated with unique(), matching the B-cell and
# T-cell calls in this script.
# NOTE(review): whether Stouffer_score() de-duplicates internally is not
# visible here.
S_Epigene <- Stouffer_score(
  data_list = dataList,
  min_Sample = 5,
  min_Gene = 500,
  gene_list = unique(Epi_marker_symbol$official.gene.symbol),
  MetaData = MetaData,
  data_id = dataId,
  Groupby = "Clusters",
  DDLKCluster_data = DDLK_Clusters3
)

# Box plot of the score per cluster (outlier points hidden), annotated with
# t-test significance labels for the comparisons returned by Stouffer_score().
ggplot(
  S_Epigene$Stouffer_score,
  aes(x = Clusters, y = Stouffer_score, fill = Clusters)
) +
  geom_boxplot(outlier.shape = NA) +
  theme_classic() +
  scale_fill_manual(
    values = c("deepskyblue3", "darkred", "darkgreen", "dark turquoise")
  ) +
  ggtitle("Stouffer's score for the Epithelial genes") +
  stat_compare_means(
    comparisons = S_Epigene$comparisons,
    label = "p.signif",
    method = "t.test",
    ref.group = ".all."
  )

# Stouffer's score for B and T lymphocyte marker genes

# Rows of the marker table annotated as T cells and as B cells.
T_cell_marker <- marker_genes[marker_genes$cell.type == "T cells", ]
B_cell_marker <- marker_genes[marker_genes$cell.type == "B cells", ]

# Stouffer's score for the unique B-cell marker symbols, grouped by cluster.
S_blood <- Stouffer_score(
  data_list = dataList,
  min_Sample = 5,
  min_Gene = 500,
  gene_list = unique(B_cell_marker$official.gene.symbol),
  MetaData = MetaData,
  data_id = dataId,
  Groupby = "Clusters",
  DDLKCluster_data = DDLK_Clusters3
)

# Box plot of the score per cluster (outlier points hidden), annotated with
# t-test significance labels for the comparisons returned by Stouffer_score().
ggplot(
  S_blood$Stouffer_score,
  aes(x = Clusters, y = Stouffer_score, fill = Clusters)
) +
  geom_boxplot(outlier.shape = NA) +
  theme_classic() +
  scale_fill_manual(
    values = c("deepskyblue3", "darkred", "darkgreen", "dark turquoise")
  ) +
  ggtitle("Stouffer's score for the B lymphocyte genes") +
  stat_compare_means(
    comparisons = S_blood$comparisons,
    label = "p.signif",
    method = "t.test",
    ref.group = ".all."
  )

# Stouffer's score for the unique T-cell marker symbols, grouped by cluster.
S_blood_T <- Stouffer_score(
  data_list = dataList,
  min_Sample = 5,
  min_Gene = 500,
  gene_list = unique(T_cell_marker$official.gene.symbol),
  MetaData = MetaData,
  data_id = dataId,
  Groupby = "Clusters",
  DDLKCluster_data = DDLK_Clusters3
)

# Box plot of the score per cluster (outlier points hidden), annotated with
# t-test significance labels for the comparisons returned by Stouffer_score().
ggplot(
  S_blood_T$Stouffer_score,
  aes(x = Clusters, y = Stouffer_score, fill = Clusters)
) +
  geom_boxplot(outlier.shape = NA) +
  theme_classic() +
  scale_fill_manual(
    values = c("deepskyblue3", "darkred", "darkgreen", "dark turquoise")
  ) +
  ggtitle("Stouffer's score for the T lymphocyte genes") +
  stat_compare_means(
    comparisons = S_blood_T$comparisons,
    label = "p.signif",
    method = "t.test",
    ref.group = ".all."
  )

# ARI, NMI, and cluster purity

# Agreement between the cluster assignment and the annotated cell type.
# The `## [1] ...` lines are the values recorded from a previous run.
aricode::ARI(
  DDLK_Clusters3$PathwayDDLK_clust[["Clusters"]],
  DDLK_Clusters3$PathwayDDLK_clust[["Cell_type"]]
)
## [1] 0.3139282
aricode::NMI(
  DDLK_Clusters3$PathwayDDLK_clust[["Clusters"]],
  DDLK_Clusters3$PathwayDDLK_clust[["Cell_type"]]
)
## [1] 0.358331
#' Cluster purity of a clustering against reference classes
#'
#' Cross-tabulates classes (rows) against clusters (columns), takes the
#' largest class count within each cluster, and divides the sum of those
#' counts by the number of cluster labels.
#'
#' @param clusters Vector of cluster labels, one per observation.
#' @param classes Vector of reference class labels, same length as `clusters`.
#' @return A single numeric value: summed per-cluster majority counts divided
#'   by `length(clusters)`.
ClusterPurity <- function(clusters, classes) {
  contingency <- table(classes, clusters)
  # Size of the dominant class inside each cluster (one value per column)
  majority_per_cluster <- apply(contingency, 2, max)
  sum(majority_per_cluster) / length(clusters)
}
# Purity of the clusters with respect to the annotated cell type.
# The `## [1] ...` line is the value recorded from a previous run.
ClusterPurity(
  DDLK_Clusters3$PathwayDDLK_clust[["Clusters"]],
  DDLK_Clusters3$PathwayDDLK_clust[["Cell_type"]]
)
## [1] 0.9949239