Load datasets for Study 1
# .RData files (a *_Data and a *_metaData file per study) for Study 1,
# listed in the exact order in which they are loaded.
study1_rdata_files <- c(
  "~/Supplemental_code/unCTC_datasets/Zheng_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Zheng_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Velten_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Velten_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Sarioglu_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Sarioglu_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Jordan_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Jordan_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Aceto_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Aceto_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Yu_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Yu_et_al._Data.RData",
  "~/Supplemental_code/unCTC_datasets/Ting_et_al._metaData.RData",
  "~/Supplemental_code/unCTC_datasets/Ting_et_al._Data.RData"
)

# load() is called at top level so the stored objects are restored into the
# same environment as with one explicit load() call per file.
for (rdata_file in study1_rdata_files) {
  load(rdata_file)
}
Load geneset
# Load the gene-set collection; the .RData file is expected to provide the
# object `c2.all.v7.2.symbols`, which is aliased to `genesets` for later use.
load("~/Supplemental_code/unCTC_datasets/c2.all.v7.2.symbols.RData")
genesets <- c2.all.v7.2.symbols
Call the PathwayEnrichmentScore() function to calculate the pathway
enrichment scores.
# Compute the pathway scores. The call is wrapped in capture.output() and
# invisible() so that whatever it prints is captured and not shown.
# NOTE(review): `dataList` and `dataId` are not defined in the visible part
# of this script -- confirm they are created before this point.
invisible(
  capture.output({
    Pathway_score <- PathwayEnrichmentScore(
      data_list = dataList,
      data_id = dataId,
      Genesets = genesets,
      min.size = 10,
      max.size = 500,
      min_Sample = 10,
      min_Gene = 1000,
      Parallel_threads = 8L
    )
  })
)
Call the DDLK_Clust() method. Python 3 must be configured as the default
Python path for this call to work.
library(reticulate)
# Optional: point reticulate at a specific Python installation, then verify it.
# use_python("/home/saritap/.local/lib/python3.7/site-packages", required = TRUE)
# reticulate::py_config()
# If the reported Python version differs from the one at the given path,
# restart the R session and set the path again.

# Cluster the pathway scores with DDLK_Clust (n = 3); output files, if any,
# go to the current working directory.
# NOTE(review): `MetaData` is not defined in the visible part of this script --
# confirm it is created before this point.
DDLK_Clusters3 <- DDLK_Clust(
  PathwayScore = Pathway_score$Pathway_score,
  PathwayMetaData = Pathway_score$Pathway_metadata,
  n = 3,
  out.dir = getwd(),
  MetaData = MetaData
)
PCA plots
# PCA of the pathway scores, with the cluster table attached as metadata.
p1 <- pca(
  DDLK_Clusters3$Pathway_score,
  metadata = DDLK_Clusters3$PathwayDDLK_clust,
  removeVar = 0.1
)

# First two principal components next to the cluster table columns,
# keeping the cluster table's row names.
gsva_pca_df <- data.frame(
  PC1 = p1$rotated$PC1,
  PC2 = p1$rotated$PC2,
  DDLK_Clusters3$PathwayDDLK_clust
)
rownames(gsva_pca_df) <- rownames(DDLK_Clusters3$PathwayDDLK_clust)

# Color key shared by the PCA and UMAP scatter plots.
ColorKeyDataID <- c(
  "coral4", "darkcyan", "steelblue", "orangered",
  "darkolivegreen4", "lightsteelblue3",
  "darkorchid4", "darkslategray", "salmon3",
  "paleturquoise1", "mediumaquamarine",
  "greenyellow", "black", "deepskyblue3", "mediumblue",
  "peru", "gold", "gray50", "hotpink", "khaki3",
  "yellow4", "lavender", "cornsilk4", "orchid4",
  "yellow3", "darkgreen", "skyblue1", "khaki4",
  "tan4", "firebrick1", "pink"
)
# PCA scatter plot (PC1 vs PC2), points colored by DataID.
ggplot(gsva_pca_df, aes(x = PC1, y = PC2, col = DataID)) +
  geom_point(size = 6, stroke = 0.2, shape = 16) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    legend.position = "right",
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 14)
  ) +
  # Draw the legend keys at the same size as the plotted points.
  guides(colour = guide_legend(override.aes = list(size = 6))) +
  scale_x_continuous(minor_breaks = seq(-10, 10, 5)) +
  scale_y_continuous(minor_breaks = seq(-10, 10, 5)) +
  scale_colour_manual(values = ColorKeyDataID)

# PCA scatter plot (PC1 vs PC2), points colored by Clusters.
ggplot(gsva_pca_df, aes(x = PC1, y = PC2, col = Clusters)) +
  geom_point(size = 6, stroke = 0.2, shape = 20) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    legend.position = "right",
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 14)
  ) +
  # Draw the legend keys at the same size as the plotted points.
  guides(colour = guide_legend(override.aes = list(size = 6))) +
  scale_x_continuous(minor_breaks = seq(-10, 10, 5)) +
  scale_y_continuous(minor_breaks = seq(-10, 10, 5)) +
  scale_colour_manual(values = ColorKeyDataID)

UMAP plots
# Cluster table and pathway scores; score columns are put in the same order
# as the rows of the cluster table.
Pathway.metadata <- DDLK_Clusters3$PathwayDDLK_clust
Pathway.score <- DDLK_Clusters3$Pathway_score[, rownames(Pathway.metadata)]

# 2-D UMAP embedding of the transposed score matrix.
set.seed(100)
umap_path <- umap(
  t(Pathway.score),
  n_neighbors = 50,
  n_components = 2,
  metric = "euclidean",
  n_epochs = 100,
  min_dist = 0.1,
  init = "spectral",
  random_state = 123,
  alpha = 2.5,
  gamma = 2.5,
  negative_sample_rate = 5
)
# Alternative parameter set, kept for reference (disabled):
# umap_path = umap(t((Pathway_score)),n_neighbors= 15,
# n_components= 3,metric= "euclidean",n_epochs= 10,
# min_dist = 0.01,init ="spectral",random_state=1234,alpha=2,gamma=2,
# negative_sample_rate=5)

# The two UMAP coordinates next to the cluster table columns.
umap_df <- data.frame(
  UMAP1 = umap_path$layout[, 1],
  UMAP2 = umap_path$layout[, 2],
  Pathway.metadata
)
# UMAP scatter plot (UMAP1 vs UMAP2), points colored by DataID.
# Disabled leftover: Cluster_labels = gsva_pca_df[,Color_cluster]
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, col = DataID)) +
  geom_point(size = 6, stroke = 0.2, shape = 16) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    legend.position = "right",
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 14)
  ) +
  # Draw the legend keys at the same size as the plotted points.
  guides(colour = guide_legend(override.aes = list(size = 6))) +
  scale_x_continuous(minor_breaks = seq(-10, 10, 5)) +
  scale_y_continuous(minor_breaks = seq(-10, 10, 5)) +
  scale_colour_manual(values = ColorKeyDataID)

# UMAP scatter plot (UMAP1 vs UMAP2), points colored by Clusters.
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, col = Clusters)) +
  geom_point(size = 6, stroke = 0.2, shape = 20) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    legend.position = "right",
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 14)
  ) +
  # Draw the legend keys at the same size as the plotted points.
  guides(colour = guide_legend(override.aes = list(size = 6))) +
  scale_x_continuous(minor_breaks = seq(-10, 10, 5)) +
  scale_y_continuous(minor_breaks = seq(-10, 10, 5)) +
  scale_colour_manual(values = ColorKeyDataID)

# Stacked bar chart: number of rows of the cluster table per cluster,
# with each bar split by Cell_type.
# Only the `fill` aesthetic is mapped, so no colour scale or colour legend
# guide is added: a scale_color_manual() call without `values` has nothing to
# act on here and fails on ggplot2 releases where `values` is mandatory.
ggplot(
  DDLK_Clusters3$PathwayDDLK_clust,
  aes(x = Clusters, fill = Cell_type)
) +
  theme_classic() +
  geom_bar(stat = "count") +
  scale_fill_manual(
    values = c("dodgerblue4", "firebrick3", "darkgreen", "dark turquoise")
  ) +
  theme(
    legend.text = element_text(size = 14),
    plot.title = element_text(size = 16),
    legend.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 22, face = "bold")
  )

Stouffer’s score
# Marker gene table read from the PanglaoDB tab-delimited export.
marker_genes <- read.delim(
  "~/Supplemental_code/unCTC_datasets/PanglaoDB_markers_27_Mar_2020.tsv"
)
Stouffer’s score for Epithelial marker genes
# Rows of the marker table annotated as epithelial cells.
Epi_marker_symbol <- marker_genes[
  marker_genes$cell.type == "Epithelial cells",
]

# Stouffer's score for the epithelial marker genes, grouped by cluster.
S_Epigene <- Stouffer_score(
  data_list = dataList,
  min_Sample = 5,
  min_Gene = 500,
  gene_list = Epi_marker_symbol$official.gene.symbol,
  MetaData = MetaData,
  data_id = dataId,
  Groupby = "Clusters",
  DDLKCluster_data = DDLK_Clusters3
)

# Box plot of the score per cluster (outlier points hidden), with t-test
# significance marks for the comparisons returned by Stouffer_score().
ggplot(
  S_Epigene$Stouffer_score,
  aes(x = Clusters, y = Stouffer_score, fill = Clusters)
) +
  geom_boxplot(outlier.shape = NA) +
  theme_classic() +
  scale_fill_manual(
    values = c("deepskyblue3", "darkred", "darkgreen", "dark turquoise")
  ) +
  ggtitle("Stouffer's score for the Epithelial genes") +
  stat_compare_means(
    comparisons = S_Epigene$comparisons,
    label = "p.signif",
    method = "t.test",
    ref.group = ".all."
  )

Stouffer’s score for B and T Lymphocyte marker genes.
# --- B lymphocyte marker genes ---
B_cell_marker <- marker_genes[marker_genes$cell.type == "B cells", ]

# Stouffer's score for the unique B cell marker symbols, grouped by cluster.
S_blood <- Stouffer_score(
  data_list = dataList,
  min_Sample = 5,
  min_Gene = 500,
  gene_list = unique(B_cell_marker$official.gene.symbol),
  MetaData = MetaData,
  data_id = dataId,
  Groupby = "Clusters",
  DDLKCluster_data = DDLK_Clusters3
)

# Box plot per cluster (outlier points hidden) with t-test significance marks.
ggplot(
  S_blood$Stouffer_score,
  aes(x = Clusters, y = Stouffer_score, fill = Clusters)
) +
  geom_boxplot(outlier.shape = NA) +
  theme_classic() +
  scale_fill_manual(
    values = c("deepskyblue3", "darkred", "darkgreen", "dark turquoise")
  ) +
  ggtitle("Stouffer's score for the B lymphocyte genes") +
  stat_compare_means(
    comparisons = S_blood$comparisons,
    label = "p.signif",
    method = "t.test",
    ref.group = ".all."
  )

# --- T lymphocyte marker genes ---
T_cell_marker <- marker_genes[marker_genes$cell.type == "T cells", ]

# Stouffer's score for the unique T cell marker symbols, grouped by cluster.
S_blood_T <- Stouffer_score(
  data_list = dataList,
  min_Sample = 5,
  min_Gene = 500,
  gene_list = unique(T_cell_marker$official.gene.symbol),
  MetaData = MetaData,
  data_id = dataId,
  Groupby = "Clusters",
  DDLKCluster_data = DDLK_Clusters3
)

# Box plot per cluster (outlier points hidden) with t-test significance marks.
ggplot(
  S_blood_T$Stouffer_score,
  aes(x = Clusters, y = Stouffer_score, fill = Clusters)
) +
  geom_boxplot(outlier.shape = NA) +
  theme_classic() +
  scale_fill_manual(
    values = c("deepskyblue3", "darkred", "darkgreen", "dark turquoise")
  ) +
  ggtitle("Stouffer's score for the T lymphocyte genes") +
  stat_compare_means(
    comparisons = S_blood_T$comparisons,
    label = "p.signif",
    method = "t.test",
    ref.group = ".all."
  )

ARI, NMI and Cluster purity
# Agreement between the cluster assignment and the Cell_type annotation:
# adjusted Rand index, then normalized mutual information.
with(DDLK_Clusters3$PathwayDDLK_clust, aricode::ARI(Clusters, Cell_type))
## [1] 0.3139282
with(DDLK_Clusters3$PathwayDDLK_clust, aricode::NMI(Clusters, Cell_type))
## [1] 0.358331
#' Cluster purity
#'
#' Cross-tabulates classes against clusters, takes the largest class count
#' within each cluster, and divides the sum of those counts by the number of
#' cluster labels supplied.
#'
#' @param clusters Vector of cluster labels, one per observation.
#' @param classes Vector of class labels, same length as `clusters`.
#' @return A single number: summed per-cluster majority counts divided by
#'   `length(clusters)`.
ClusterPurity <- function(clusters, classes) {
  # Rows are classes, columns are clusters.
  contingency <- table(classes, clusters)
  majority_counts <- apply(contingency, 2, max)
  sum(majority_counts) / length(clusters)
}
# Purity of the clusters with respect to the Cell_type annotation.
ClusterPurity(
  clusters = DDLK_Clusters3$PathwayDDLK_clust$Clusters,
  classes = DDLK_Clusters3$PathwayDDLK_clust$Cell_type
)
## [1] 0.9949239