Include libraries

library(Linnorm)
library(SingleCellExperiment)
library(GSVA)
library(ggplot2)
library(factoextra)
library(NbClust)
library(reticulate)
library(PCAtools)
library(umap)
library(dplyr)
library(infercnv)
library(readtext)
library(D3GB)
library(GenomicRanges)
library(IRanges)
library(S4Vectors)
library(ggpubr)

Source all the scripts

source("~/Supplemental_code/unCTC/CreateSingleCellObject.R")
source("~/Supplemental_code/unCTC/DDLK_Clust.R")
source("~/Supplemental_code/unCTC/Gene_Violin_plots.R")
source("~/Supplemental_code/unCTC/PathwayEnrichmentScore.R")
source("~/Supplemental_code/unCTC/Stouffer_score.R")
source("~/Supplemental_code/unCTC/unCTC_pathway_plots.R")
source("~/Supplemental_code/unCTC/unCTC_libraries.R")
source("~/Supplemental_code/unCTC/GroupsDiffGenes.R")
source("~/Supplemental_code/unCTC/GroupsDiffPathways.R")
source("~/Supplemental_code/unCTC/CNV_alterations.R")

Load datasets for Study 2

# Load the TPM expression matrices and metadata for the Study 2 datasets.
# Each load() restores whatever objects the .RData file contains into the
# current environment; presumably each file holds a single object named after
# the file (e.g. Poonia_et_al._TPMData) -- verify against the data bundle.
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._TPMData.RData")
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._metaData.RData")
load("~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_metaData.RData")
load("~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_TPMData.RData")
load("~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_TPMData.RData")
load("~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_metaData.RData")
load("~/Supplemental_code/unCTC_datasets/Ebright_et_al._TPMData.RData")
load("~/Supplemental_code/unCTC_datasets/Ebright_et_al._metaData.RData")

Load geneset

This package includes one gene-set collection (c2.all.v7.2.symbols), taken from the Molecular Signatures Database (MSigDB).

# Restore the gene-set collection object (c2.all.v7.2.symbols).
load("~/Supplemental_code/unCTC_datasets/c2.all.v7.2.symbols.RData")

# Expression matrices, one list element per dataset.
dataList <- list(
  Poonia_et_al._TPMData,
  Ebright_et_al._TPMData,
  Ding_et_al._WBC1_TPMData,
  Ding_et_al._WBC2_TPMData
)

# Dataset identifiers, listed in the same order as dataList.
dataId <- list(
  "Poonia_et_al._TPMData",
  "Ebright_et_al._TPMData",
  "Ding_et_al._WBC1_TPMData",
  "Ding_et_al._WBC2_TPMData"
)

# Per-dataset metadata, listed in the same order as dataList.
MetaData <- list(
  Poonia_et_al._metaData,
  Ebright_et_al._metaData,
  Ding_et_al._WBC1_metaData,
  Ding_et_al._WBC2_metaData
)

# Gene sets handed to the analysis functions further down.
genesets <- c2.all.v7.2.symbols

Calculate pathway enrichment score

PathwayEnrichmentScore uses the following steps:

Integrate data passed in the list based on common genes.

PathwayEnrichmentScore requires following inputs:
* data_list: List of expression data matrices
* data_id: List of the expression data matrices’ names, in the same order as data_list.
* Genesets: List of pathways.
* min.size: Minimum number of genes in a pathway/gene set. Default is 10.
* max.size: Maximum number of genes in a pathway/gene set. Default is 500.
* min_Sample: Filter out genes that are not expressed in at least min_Sample cells. Default is 5.
* min_Gene: Filter out cells that do not express at least min_Gene genes. Default is 1500.
* Parallel_threads: Number of threads used to run the process in parallel.

# Compute the pathway enrichment scores for the datasets in dataList.
# capture.output() collects anything the call prints and invisible() hides
# the captured text, so only the Pathway_score assignment remains visible.
invisible(capture.output({
  Pathway_score <- unCTC::PathwayEnrichmentScore(
    data_list = dataList,
    data_id = dataId,
    Genesets = genesets,
    min.size = 10,
    max.size = 500,
    min_Sample = 5,
    min_Gene = 1500,
    Parallel_threads = 8L
  )
}))

Calculate the optimal number of clusters for pathway enrichment score matrix

For the above pathway enrichment score matrix, we calculate the number of clusters using the Elbow method.

library(factoextra)
library(NbClust)

# Within-cluster sum of squares ("wss") diagnostic for k-means on the pathway
# score matrix; the dashed vertical line is drawn at k = 4.
elbow_plot <- fviz_nbclust(Pathway_score$Pathway_score, kmeans, method = "wss")
elbow_plot +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")

DDLK Clustering

DDLK_Clust needs the following inputs:

# To see which Python installation reticulate is using, run:
# reticulate::py_config()
# If it is not the intended one, restart the R session and set the path again.

# Cluster the pathway enrichment scores into n = 4 groups; out.dir is set to
# the current working directory.
DDLK_Clusters <- DDLK_Clust(
  PathwayScore = Pathway_score$Pathway_score,
  PathwayMetaData = Pathway_score$Pathway_metadata,
  n = 4,
  out.dir = getwd(),
  MetaData = MetaData
)

unCTC plots:

unCTC_pathway_plots plots the principal components of the pathway enrichment scores.

The required inputs for unCTC_pathway_plots are:

# Build the plot collection from the clustered pathway scores.
plots <- unCTC_pathway_plots(
  Pathway_score = DDLK_Clusters$Pathway_score,
  Pathway_metadata = DDLK_Clusters$PathwayDDLK_clust,
  colorby = "Data_id",
  Color_cluster = "Clusters",
  pairsplotLegend = "none"
)

PCA plots

# Display the two PCA plot objects stored in `plots` (the element names
# suggest colouring by class and by cluster respectively -- TODO confirm).
plots$group_by_Class_PCA

plots$group_by_Cluster_PCA

UMAP plots

# Display the two UMAP plot objects stored in `plots` (the element names
# suggest colouring by class and by cluster respectively -- TODO confirm).
plots$group_by_Class_umap

plots$group_by_Cluster_umap

The stacked bar plot shows the number of CTCs (red) and WBCs (blue) in each cluster; the x-axis shows the clusters.
library(ggplot2)

# Stacked bar chart: number of cells in each cluster, filled by Cell_type.
# Only the fill aesthetic is mapped, so only a fill scale is supplied; a
# colour scale without `values` would have nothing to act on.
# NOTE(review): the fill colours are assigned to the Cell_type levels in
# order -- confirm the level order gives the intended colour per cell type.
ggplot(DDLK_Clusters$PathwayDDLK_clust, aes(x = Clusters, fill = Cell_type)) +
  theme_classic() +
  geom_bar(stat = "count") +
  scale_fill_manual(
    values = c("dodgerblue4", "firebrick3", "darkgreen", "dark turquoise")
  )

Differential genes

Provide differential genes between given groups.

Differential_genes require following inputs:

# Differential genes between the DDLK clusters (top 200 per group).
# NOTE(review): DDLK_Clusters is passed positionally between named arguments,
# so it binds to the first formal not matched by name -- confirm which
# parameter of Differential_genes that is and consider naming it.
Diff_matrix <- unCTC::Differential_genes(
  data_list = dataList,
  min_Sample = 5,
  min_Gene = 1500,
  DDLK_Clusters,
  Genesets = genesets,
  data_id = dataId,
  data_type = "Normalised",
  DifferentiateBy = "Clusters",
  up_gene_number = 200
)

Heatmap showing the top 200 upregulated genes in the 4 clusters.

library(pheatmap)
library(viridis)

# Column annotation tracks for the heatmap. Selecting the three displayed
# columns directly makes separate deletion of the other columns unnecessary.
annotation <- Diff_matrix$annotations
ann <- annotation[, c("HormoneStatus", "Class", "Clusters")]

# Row-wise z-scores: scale() standardises columns, hence the double transpose.
scaled_genes <- t(scale(t(Diff_matrix$DiffMat)))

# `annotation_col` is the current pheatmap argument for column annotations;
# the bare `annotation` argument is deprecated.
pheatmap(
  scaled_genes,
  cluster_cols = FALSE,
  show_colnames = FALSE,
  cluster_rows = FALSE,
  show_rownames = FALSE,
  color = viridis(1000),
  annotation_col = ann
)

Differential Pathways

Provide differential pathways between given groups.

Differential_pathways require following inputs:

# Differential pathways between the DDLK clusters (top 100 per group).
Diff_path <- unCTC::Differential_pathways(
  Pathway_score,
  DDLK_Clusters = DDLK_Clusters,
  DifferentiateBy = "Clusters",
  up_pathways_number = 100
)

Out of the top 100 upregulated pathways in each cluster, we select the relevant pathways from each cluster.

# Column annotation tracks for the heatmap. Selecting the three displayed
# columns directly makes separate deletion of the other columns unnecessary.
annotation <- Diff_path$annotations
ann <- annotation[, c("HormoneStatus", "Class", "Clusters")]

# Hand-picked pathways to display. Each name appears exactly once so that no
# heatmap row is drawn twice.
Specific_pathways <- c(
  "BIOCARTA_THELPER_PATHWAY",
  "BIOCARTA_TCYTOTOXIC_PATHWAY",
  "BIOCARTA_CTL_PATHWAY",
  "BIOCARTA_IL17_PATHWAY",
  "BIOCARTA_CTLA4_PATHWAY",
  "LEE_DIFFERENTIATING_T_LYMPHOCYTE",
  "BIOCARTA_MONOCYTE_PATHWAY",
  "ZHENG_FOXP3_TARGETS_IN_T_LYMPHOCYTE_DN",
  "SPIELMAN_LYMPHOBLAST_EUROPEAN_VS_ASIAN_2FC_DN",
  "GUTIERREZ_CHRONIC_LYMPHOCYTIC_LEUKEMIA_UP",
  "GOERING_BLOOD_HDL_CHOLESTEROL_QTL_CIS",
  "FARMER_BREAST_CANCER_CLUSTER_6",
  "TURASHVILI_BREAST_NORMAL_DUCTAL_VS_LOBULAR_UP",
  "YANG_BREAST_CANCER_ESR1_BULK_UP",
  "NIKOLSKY_BREAST_CANCER_19Q13.1_AMPLICON",
  "GINESTIER_BREAST_CANCER_ZNF217_AMPLIFIED_UP",
  "HOLLERN_SOLID_NODULAR_BREAST_TUMOR_UP",
  "FINETTI_BREAST_CANCER_KINOME_RED",
  "NIKOLSKY_BREAST_CANCER_7Q21_Q22_AMPLICON",
  "YANG_BREAST_CANCER_ESR1_UP",
  "WP_MAMMARY_GLAND_DEVELOPMENT_PATHWAY_INVOLUTION_STAGE_4_OF_4",
  "MACLACHLAN_BRCA1_TARGETS_UP",
  "REACTOME_ERBB2_ACTIVATES_PTK6_SIGNALING",
  "REACTOME_PI3K_EVENTS_IN_ERBB2_SIGNALING",
  "REACTOME_PI3K_EVENTS_IN_ERBB4_SIGNALING",
  "REACTOME_SHC1_EVENTS_IN_ERBB2_SIGNALING",
  "REACTOME_GRB2_EVENTS_IN_ERBB2_SIGNALING"
)

# Row subset by pathway name; drop = FALSE keeps the result a matrix.
mat <- Diff_path$DiffMatpathway[Specific_pathways, , drop = FALSE]

# Row-wise z-scores (scale() works on columns, hence the double transpose).
# `annotation_col` is the current pheatmap argument for column annotations;
# the bare `annotation` argument is deprecated.
pheatmap(
  t(scale(t(mat))),
  cluster_cols = FALSE,
  show_colnames = FALSE,
  cluster_rows = FALSE,
  show_rownames = TRUE,
  fontsize_row = 7,
  fontsize = 7,
  color = viridis(1000),
  annotation_col = ann
)

Calculate Stouffer’s score

Stouffer_score method uses the following steps:

The following inputs are required to calculate Stouffer’s score:

With this package, we provide two types of gene lists:

Load genelists

# Restore the two gene-list objects from their .RData files (presumably
# Breast_elevated_genes and Blood_specific_gene, matching the file names --
# verify against the data bundle).
load("~/Supplemental_code/unCTC_datasets/Breast_elevated_genes.RData")
load("~/Supplemental_code/unCTC_datasets/Blood_specific_gene.RData")

Stouffer’s Score:

# Stouffer's score for the blood-specific gene list, grouped by cluster.
S_WBC <- Stouffer_score(
  data_list = dataList,
  min_Sample = 5,
  min_Gene = 1500,
  gene_list = Blood_specific_gene,
  data_id = dataId,
  Groupby = "Clusters",
  DDLKCluster_data = DDLK_Clusters
)

# Stouffer's score for the breast-elevated gene list, grouped by cluster.
S_Breast <- Stouffer_score(
  data_list = dataList,
  min_Sample = 5,
  min_Gene = 1500,
  gene_list = Breast_elevated_genes,
  data_id = dataId,
  Groupby = "Clusters",
  DDLKCluster_data = DDLK_Clusters
)

Stouffer’s Score Plot

For better colour visualization we are using following color key:

library(ggplot2)
library(ggpubr)

# Fill colours for the Stouffer score box plots (15 entries).
ColorKey <- c(
  "darkred", "deepskyblue3", "darkolivegreen4",
  "dark turquoise", "pale violet red",
  "steelblue", "forestgreen", "gray2",
  "gray50", "hotpink", "lightslateblue",
  "tan4", "yellow3", "sienna4", "orchid4"
)

For Immune genes:

# Box plot of the blood gene list Stouffer scores, one box per cluster, with
# outlier points hidden and t-test significance labels.
# NOTE(review): both `comparisons` and `ref.group` are passed to
# stat_compare_means(); check the ggpubr documentation for which takes effect.
ggplot(
  S_WBC$Stouffer_score,
  aes(x = Clusters, y = Stouffer_score, fill = Clusters)
) +
  geom_boxplot(outlier.shape = NA) +
  theme_classic() +
  scale_fill_manual(values = ColorKey) +
  ggtitle("Immune gene signature") +
  stat_compare_means(
    comparisons = S_WBC$comparisons,
    label = "p.signif",
    method = "t.test",
    ref.group = ".all."
  )

For Breast elevated genes:

# Box plot of the breast-elevated gene list Stouffer scores, one box per
# cluster, with outlier points hidden and t-test significance labels.
# NOTE(review): both `comparisons` and `ref.group` are passed to
# stat_compare_means(); check the ggpubr documentation for which takes effect.
ggplot(
  S_Breast$Stouffer_score,
  aes(x = Clusters, y = Stouffer_score, fill = Clusters)
) +
  geom_boxplot(outlier.shape = NA) +
  theme_classic() +
  scale_fill_manual(values = ColorKey) +
  ggtitle("Breast elevated gene signature") +
  stat_compare_means(
    comparisons = S_Breast$comparisons,
    label = "p.signif",
    method = "t.test",
    ref.group = ".all."
  )

Copy Number Variation Analysis:

The inferCNV R package is used to analyse copy number variation from raw count/TPM data. In addition to the full inferCNV analysis, unCTC::CNV_alterations calculates the positions of additions and deletions on the p and q arms in the test/cancerous/diseased data compared with the reference/normal/healthy data. To derive p and q arm locations from the inferCNV events, we used GRCh37 cytoband information.

CNV_alterations require the following inputs:

Load gene order file

# Gene position table exported by the unCTC package.
gencode_v19_gene_pos <- unCTC::gencode_v19_gene_pos

Load data for copy number variation

# Restore the count matrices and metadata from their .RData files
# (presumably one object per file, named after the file -- verify).
# Poonia_et_al._metaData.RData is also loaded earlier in this script;
# loading it again restores the same file's objects.
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._PBMC_CountData.RData")
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._PBMC_metaData.RData")
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._CountData.RData")
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._metaData.RData")

InferCNV between Poonia et al.’s CTCs and WBCs

Here we do not evaluate the copy number variation script; only its results are included.

# Reference data (labelled "WBC") and observation data (labelled "CTC") for
# the first CNV run.
ref_data <- Poonia_et_al._PBMC_CountData
ref_metadata <- Poonia_et_al._PBMC_metaData
obs_data <- Poonia_et_al._CountData
obs_metadata <- Poonia_et_al._metaData

# Inputs are ordered reference first, observation second.
dataList1 <- list(ref_data, obs_data)
dataId1 <- list("WBC", "CTC")
MetaData1 <- list(ref_metadata, obs_metadata)

# Cells are grouped by the GroupID metadata column; "WBC" is the reference
# group. `path` is set to the current working directory.
CNV_Alterations1 <- CNV_alterations(
  data_list = dataList1,
  data_id = dataId1,
  min_Sample = 5,
  min_Gene = 1500,
  path = getwd(),
  GenePositionFile = gencode_v19_gene_pos,
  threads_no = 8,
  MetaData = MetaData1,
  Groupby = "GroupID",
  Reference_name = c("WBC"),
  obs.title = "Observations",
  ref.title = "References",
  cluster_by_groups = TRUE,
  out.Filename = "inferCNV"
)

InferCNV by taking Ebright et al.’s CTCs as observation

# Reference (WBC) and observation (CTC) count matrices for the second CNV run.
# NOTE(review): the paths follow the ~/Supplemental_code/unCTC_datasets layout
# used by every other load() in this script rather than a user-specific
# /home/... directory -- confirm both files are shipped in that folder.
load("~/Supplemental_code/unCTC_datasets/GSE181279_Countdata.RData")
ref_data <- GSE181279_Countdata
load("~/Supplemental_code/unCTC_datasets/Ebright_et_al._CountData.RData")
obs_data <- Ebright_et_al._CountData

# Run-specific names (mirroring dataList1/dataId1 of the first CNV run) keep
# the four-dataset dataList/dataId, which other steps of this script pair
# with DDLK_Clusters, from being overwritten.
dataList2 <- list(ref_data, obs_data)
dataId2 <- list("WBC", "CTC")

# No MetaData is supplied, so cells are grouped by Data_id; "WBC" is the
# reference group. `path` is set to the current working directory.
CNV_Alterations2 <- CNV_alterations(
  data_list = dataList2,
  data_id = dataId2,
  min_Sample = 5,
  min_Gene = 1500,
  path = getwd(),
  GenePositionFile = gencode_v19_gene_pos,
  threads_no = 8,
  Groupby = "Data_id",
  Reference_name = c("WBC"), # WBC data as reference
  obs.title = "Observations",
  ref.title = "References",
  out.Filename = "inferCNV1"
)

Gene_Violin_plots

Produces a violin plot of the expression of a given canonical marker gene.

Gene_Violin_plots require input:
* data_list: List of expression data matrices
* data_id: List of expression data matrices name in the same order.
* min_Sample: Filter out genes that are not expressed in at least min_Sample cells. Default is 5.
* min_Gene: Filter out cells that do not express at least min_Gene genes. Default is 1500.
* gene_symbol: The specific gene whose expression we want to see.
* MetaData: Optional, list of metadata of expression matrices. If given then columns of all metadata in the list must be identical.
* Groupby: Any column name from MetaData, which we want to use to see differential expression of the gene. Default is “data_id”.

# Violin plots of four marker genes, grouped by DDLK cluster. The four calls
# differ only in gene_symbol.
# NOTE(review): dataList/dataId here should presumably be the same
# four-dataset lists that were used to build DDLK_Clusters; confirm they have
# not been reassigned earlier in the script.
PTPRC <- Gene_Violin_plots(
  data_list = dataList,
  data_id = dataId,
  min_Sample = 5,
  min_Gene = 1500,
  gene_symbol = "PTPRC",
  DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
  Groupby = "Clusters"
)

NKG7 <- Gene_Violin_plots(
  data_list = dataList,
  data_id = dataId,
  min_Sample = 5,
  min_Gene = 1500,
  gene_symbol = "NKG7",
  DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
  Groupby = "Clusters"
)

EPCAM <- Gene_Violin_plots(
  data_list = dataList,
  data_id = dataId,
  min_Sample = 5,
  min_Gene = 1500,
  gene_symbol = "EPCAM",
  DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
  Groupby = "Clusters"
)

KRT18 <- Gene_Violin_plots(
  data_list = dataList,
  data_id = dataId,
  min_Sample = 5,
  min_Gene = 1500,
  gene_symbol = "KRT18",
  DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
  Groupby = "Clusters"
)

Canonical marker expression

library(cowplot)
# Display the Violin_plot element of each Gene_Violin_plots result, one
# marker gene at a time.
PTPRC$Violin_plot

NKG7$Violin_plot

EPCAM$Violin_plot

KRT18$Violin_plot

ARI, NMI and Cluster purity

# Agreement between the DDLK cluster labels and the Cell_type labels:
# adjusted Rand index, then normalised mutual information. The `##` lines
# are the values printed when this document was rendered.
aricode::ARI(DDLK_Clusters$PathwayDDLK_clust$Clusters,DDLK_Clusters$PathwayDDLK_clust$Cell_type)
## [1] 0.570144
aricode::NMI(DDLK_Clusters$PathwayDDLK_clust$Clusters,DDLK_Clusters$PathwayDDLK_clust$Cell_type)
## [1] 0.47599
# Cluster purity: for every cluster take the size of its most frequent class,
# add these counts up and divide by the total number of labelled items.
#   clusters: vector of cluster assignments, one entry per item.
#   classes:  vector of reference class labels, same length as `clusters`.
# Returns a single number; 1 means every cluster contains only one class.
ClusterPurity <- function(clusters, classes) {
  # Rows are classes, columns are clusters.
  contingency <- table(classes, clusters)
  majority_counts <- apply(contingency, MARGIN = 2, FUN = max)
  sum(majority_counts) / length(clusters)
}
# Purity of the DDLK clusters with Cell_type as the reference classes. The
# `##` line is the value printed when this document was rendered.
ClusterPurity(DDLK_Clusters$PathwayDDLK_clust$Clusters,DDLK_Clusters$PathwayDDLK_clust$Cell_type)
## [1] 0.982823