Include libraries

library(Linnorm)
library(SingleCellExperiment)
library(GSVA)
library(ggplot2)
library(factoextra)
library(NbClust)
library(reticulate)
library(PCAtools)
library(umap)
library(dplyr)
library(infercnv)
library(readtext)
library(D3GB)
library(GenomicRanges)
library(IRanges)
library(S4Vectors)

Source all the scripts

source("~/Supplemental_code/unCTC/CreateSingleCellObject.R")
source("~/Supplemental_code/unCTC/DDLK_Clust.R")
source("~/Supplemental_code/unCTC/Gene_Violin_plots.R")
source("~/Supplemental_code/unCTC/PathwayEnrichmentScore.R")
source("~/Supplemental_code/unCTC/Stouffer_score.R")
source("~/Supplemental_code/unCTC/unCTC_pathway_plots.R")
source("~/Supplemental_code/unCTC/unCTC_libraries.R")
source("~/Supplemental_code/unCTC/GroupsDiffGenes.R")
source("~/Supplemental_code/unCTC/GroupsDiffPathways.R")
source("~/Supplemental_code/unCTC/CNV_alterations.R")

Load datasets

# Load the count data of each study. Each .RData file is expected to
# restore the object named after it (e.g. Poonia_et_al._CountData); those
# object names are used directly by the steps that follow.
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._CountData.RData")
load("~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_CountData.RData")
load("~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_CountData.RData")
load("~/Supplemental_code/unCTC_datasets/Ebright_et_al._CountData.RData")

Make metaData file

# Per-dataset cell annotation tables. Each table has one row per column of
# the corresponding count data (row names are the count data's column
# names), a Data_source column holding the dataset identifier, and a Class
# column holding the cell-type label ("CTC" or "WBC").

# Poonia et al.: labelled CTC
Poonia_et_al._CountmetaData <- data.frame(
  Data_source = rep(
    "Poonia_et_al._CountData",
    ncol(Poonia_et_al._CountData)
  ),
  row.names = colnames(Poonia_et_al._CountData)
)
Poonia_et_al._CountmetaData[["Class"]] <- "CTC"

# Ebright et al.: labelled CTC
Ebright_et_al._CountmetaData <- data.frame(
  Data_source = rep(
    "Ebright_et_al._CountData",
    ncol(Ebright_et_al._CountData)
  ),
  row.names = colnames(Ebright_et_al._CountData)
)
Ebright_et_al._CountmetaData[["Class"]] <- "CTC"

# Ding et al., first WBC set: labelled WBC
Ding_et_al._WBC1_CountmetaData <- data.frame(
  Data_source = rep(
    "Ding_et_al._WBC1_CountData",
    ncol(Ding_et_al._WBC1_CountData)
  ),
  row.names = colnames(Ding_et_al._WBC1_CountData)
)
Ding_et_al._WBC1_CountmetaData[["Class"]] <- "WBC"

# Ding et al., second WBC set: labelled WBC
Ding_et_al._WBC2_CountmetaData <- data.frame(
  Data_source = rep(
    "Ding_et_al._WBC2_CountData",
    ncol(Ding_et_al._WBC2_CountData)
  ),
  row.names = colnames(Ding_et_al._WBC2_CountData)
)
Ding_et_al._WBC2_CountmetaData[["Class"]] <- "WBC"

Load geneset

This package includes one gene set collection, taken from the Molecular Signatures Database (MSigDB).

# Gene set collection shipped with the supplemental data; the file is
# expected to restore the object c2.all.v7.2.symbols used further down.
load("~/Supplemental_code/unCTC_datasets/c2.all.v7.2.symbols.RData")

# Dataset identifiers. dataList and MetaData below follow this same order.
dataId <- list(
  "Poonia_et_al._CountData",
  "Ebright_et_al._CountData",
  "Ding_et_al._WBC1_CountData",
  "Ding_et_al._WBC2_CountData"
)

# Expression (count) data, one list element per dataset
dataList <- list(
  Poonia_et_al._CountData,
  Ebright_et_al._CountData,
  Ding_et_al._WBC1_CountData,
  Ding_et_al._WBC2_CountData
)

# Cell annotation tables, one list element per dataset
MetaData <- list(
  Poonia_et_al._CountmetaData,
  Ebright_et_al._CountmetaData,
  Ding_et_al._WBC1_CountmetaData,
  Ding_et_al._WBC2_CountmetaData
)

# Gene sets passed to the pathway scoring step
genesets <- c2.all.v7.2.symbols

Calculate pathway enrichment score

PathwayEnrichmentScore uses the following steps:

Integrate data passed in the list based on common genes.

PathwayEnrichmentScore requires following inputs:
* data_list: List of expression data matrices
* data_id: List of the expression data matrices' names, in the same order.
* Genesets: List of pathways
* min.size: Minimum number of genes in a pathway/gene set, Default is 10
* max.size: Maximum number of genes in a pathway/gene set, Default is 500
* min_Sample: Filter out genes which are not expressed in at least min_Sample cells, Default is 5.
* min_Gene: Filter out those cells which do not express at least min_Gene genes, Default is 1500.
* Parallel_threads: Number of threads in parallel to execute process

# Compute the pathway enrichment scores for all datasets.
#
# The function is called unqualified so that the definition sourced from
# PathwayEnrichmentScore.R is used, like every other unCTC function in this
# script. The previous `unCTC::` prefix required the unCTC package to be
# installed, which this script never sets up or attaches.
#
# Arguments:
#   data_list        list of expression data matrices
#   data_id          names of those matrices, in the same order
#   Genesets         list of pathways / gene sets
#   min.size         minimum number of genes in a pathway
#   max.size         maximum number of genes in a pathway
#   min_Sample       drop genes not expressed in at least this many cells
#   min_Gene         drop cells expressing fewer than this many genes
#   Parallel_threads number of threads used for the computation
#
# capture.output() collects whatever the call prints to stdout and
# invisible() discards it, so the scoring step runs quietly. The assignment
# to Pathway_score still happens in the calling environment.
invisible(capture.output({
  Pathway_score <- PathwayEnrichmentScore(
    data_list = dataList,
    data_id = dataId,
    Genesets = genesets,
    min.size = 10,
    max.size = 500,
    min_Sample = 5,
    min_Gene = 1500,
    Parallel_threads = 8L
  )
}))

Calculate the optimal number of clusters for pathway enrichment score matrix

For the above pathway enrichment score matrix, we calculate the number of clusters using the Elbow method.

# reticulate::py_config() reports which Python installation reticulate is
# using. If that is not the intended one, restart the R session and set the
# Python path again before running the clustering step.
# reticulate::py_config()

# Cluster the pathway enrichment scores (n = 4); output goes to the current
# working directory.
DDLK_Clusters <- DDLK_Clust(
  PathwayScore = Pathway_score$Pathway_score,
  PathwayMetaData = Pathway_score$Pathway_metadata,
  n = 4,
  MetaData = MetaData,
  out.dir = getwd()
)

unCTC plots:

unCTC_pathway_plots plots the principal components of the pathway enrichment scores.

The required inputs for unCTC_pathway_plots are shown in the call below:

# Build the plot collection from the clustered pathway scores. The
# individual plots are pulled out of the returned object by name afterwards.
plots <- unCTC_pathway_plots(
  Pathway_score = DDLK_Clusters$Pathway_score,
  Pathway_metadata = DDLK_Clusters$PathwayDDLK_clust,
  colorby = "Data_id",
  Color_cluster = "Clusters",
  pairsplotLegend = "none"
)

PCA plots

# Display the PCA plot stored as `group_by_Class_PCA`
# (presumably points grouped by cell class -- inferred from the name).
plots$group_by_Class_PCA

# Display the PCA plot stored as `group_by_Cluster_PCA`
# (presumably points grouped by cluster -- inferred from the name).
plots$group_by_Cluster_PCA

UMAP plots

# Display the UMAP plot stored as `group_by_Class_umap`
# (presumably points grouped by cell class -- inferred from the name).
plots$group_by_Class_umap

# Display the UMAP plot stored as `group_by_Cluster_umap`
# (presumably points grouped by cluster -- inferred from the name).
plots$group_by_Cluster_umap

  • The stacked bar plot shows the number of CTCs (red) and WBCs (blue) in each cluster; the x-axis shows the clusters.
library(ggplot2)
# Stacked bar chart of cluster composition: one bar per value of Clusters,
# with each bar split and filled by Class. Bar heights are cell counts.
#
# The former `scale_color_manual()` layer was removed. It was called without
# `values` and no colour aesthetic is mapped, so it never affected the plot,
# and it raises a "values is missing" error on ggplot2 releases where
# `values` is mandatory.
#
# NOTE(review): the fill colours are unnamed, so ggplot2 assigns them to the
# Class levels in order. If the levels are CTC then WBC, CTC is drawn in
# dodgerblue4 and WBC in firebrick3 -- confirm this matches the colours
# described in the accompanying text, or name the vector explicitly.
ggplot(DDLK_Clusters$PathwayDDLK_clust, aes(x = Clusters, fill = Class)) +
  theme_classic() +
  geom_bar(stat = "count") +
  scale_fill_manual(
    values = c("dodgerblue4", "firebrick3", "darkgreen", "dark turquoise")
  )

ARI, NMI and Cluster purity

# Agreement between the cluster assignment (Clusters) and the cell class
# (Class) of the clustered metadata table. Lines starting with "## [1]" are
# the output recorded from the original run.
print("ARI:")
## [1] "ARI:"
with(DDLK_Clusters$PathwayDDLK_clust, aricode::ARI(Clusters, Class))
## [1] 0.5495047
print("NMI:")
## [1] "NMI:"
with(DDLK_Clusters$PathwayDDLK_clust, aricode::NMI(Clusters, Class))
## [1] 0.4795896
print("ClusterPurity:")
## [1] "ClusterPurity:"
# Cluster purity of a clustering with respect to reference classes.
#
# clusters: vector of cluster assignments, one entry per observation
# classes:  vector of reference class labels, same length as `clusters`
#
# Returns the fraction of observations that belong to the most frequent
# class of their own cluster: for every cluster the size of its largest
# class is taken, these sizes are summed, and the sum is divided by the
# number of observations.
ClusterPurity <- function(clusters, classes) {
  # Rows are classes, columns are clusters
  contingency <- table(classes, clusters)
  dominant_counts <- vapply(
    seq_len(ncol(contingency)),
    function(j) max(contingency[, j]),
    numeric(1)
  )
  sum(dominant_counts) / length(clusters)
}
# Purity of the clusters with respect to the Class column of the clustered
# metadata table; the "## [1]" line is the output recorded originally.
with(DDLK_Clusters$PathwayDDLK_clust, ClusterPurity(Clusters, Class))
## [1] 0.9875098