library(Linnorm)
library(SingleCellExperiment)
library(GSVA)
library(ggplot2)
library(factoextra)
library(NbClust)
library(reticulate)
library(PCAtools)
library(umap)
library(dplyr)
library(infercnv)
library(readtext)
library(D3GB)
library(GenomicRanges)
library(IRanges)
library(S4Vectors)
library(ggpubr)
source("~/Supplemental_code/unCTC/CreateSingleCellObject.R")
source("~/Supplemental_code/unCTC/DDLK_Clust.R")
source("~/Supplemental_code/unCTC/Gene_Violin_plots.R")
source("~/Supplemental_code/unCTC/PathwayEnrichmentScore.R")
source("~/Supplemental_code/unCTC/Stouffer_score.R")
source("~/Supplemental_code/unCTC/unCTC_pathway_plots.R")
source("~/Supplemental_code/unCTC/unCTC_libraries.R")
source("~/Supplemental_code/unCTC/GroupsDiffGenes.R")
source("~/Supplemental_code/unCTC/GroupsDiffPathways.R")
source("~/Supplemental_code/unCTC/CNV_alterations.R")
#Create Expression data list
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._TPMData.RData")
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._metaData.RData")
load("~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_metaData.RData")
load("~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC1_TPMData.RData")
load("~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_TPMData.RData")
load("~/Supplemental_code/unCTC_datasets/Ding_et_al._WBC2_metaData.RData")
load("~/Supplemental_code/unCTC_datasets/Ebright_et_al._TPMData.RData")
load("~/Supplemental_code/unCTC_datasets/Ebright_et_al._metaData.RData")
This package includes one geneset, which is taken from molecular signature database.
load("~/Supplemental_code/unCTC_datasets/c2.all.v7.2.symbols.RData")
#Create Expression data list
dataList = list(Poonia_et_al._TPMData,Ebright_et_al._TPMData,
Ding_et_al._WBC1_TPMData,Ding_et_al._WBC2_TPMData)
#Create Data Id's list
dataId = list("Poonia_et_al._TPMData","Ebright_et_al._TPMData",
"Ding_et_al._WBC1_TPMData","Ding_et_al._WBC2_TPMData")
#Create Meta data list
MetaData = list(Poonia_et_al._metaData, Ebright_et_al._metaData,
Ding_et_al._WBC1_metaData, Ding_et_al._WBC2_metaData )
#Genesets given with this package
genesets = c2.all.v7.2.symbols
PathwayEnrichmentScore uses the following steps:
Integrate data passed in the list based on common genes.
PathwayEnrichmentScore requires following inputs:
* data_list: List of expression data matrices
* data_id: List of expression data matrices’ name in the same order. *
Genesets: List of pathways * min.size: Minimum size of genes in
pathways/Genesets, Default is 10
* max.size: Maximum size of genes in pathways/Genesets, Default is
500
* min_Sample: filter out genes which are not expressedin at least
min_Sample cells, Default is 5.
* min_Gene: Filter out those cells which do not express at least
min_Gene genes, Default is 1500. * Parallel_threads : Number of threads
in parallel to execute process
invisible({capture.output({
Pathway_score = unCTC::PathwayEnrichmentScore(data_list =dataList,
data_id = dataId,
Genesets = genesets,
min.size=10,
max.size = 500,
min_Sample = 5,
min_Gene = 1500,
Parallel_threads=8L)
})})
For the above pathway enrichment score matrix, we calculate the number of clusters using the Elbow method.
library(factoextra)
library(NbClust)
fviz_nbclust(Pathway_score$Pathway_score, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")
DDLk_Clust need the following inputs
#Retrive information about the version of python being used by reticulate
#reticulate::py_config()
#If version is different from the the given path then restart session and
#give path again can change path
DDLK_Clusters = DDLK_Clust(PathwayScore = Pathway_score$Pathway_score,
PathwayMetaData = Pathway_score$Pathway_metadata,
n = 4,
out.dir = getwd(),
MetaData = MetaData
)
unCTC_plots Plots principal components of pathway enrichment score.
Required input for unCTC_plots method is:
plots = unCTC_pathway_plots(Pathway_score = DDLK_Clusters$Pathway_score,
Pathway_metadata = DDLK_Clusters$PathwayDDLK_clust,
colorby = "Data_id",
Color_cluster = "Clusters",
pairsplotLegend = "none")
plots$group_by_Class_PCA
plots$group_by_Cluster_PCA
plots$group_by_Class_umap
plots$group_by_Cluster_umap
library(ggplot2)
ggplot(DDLK_Clusters$PathwayDDLK_clust, aes(x=Clusters, fill = Cell_type))+
theme_classic()+
geom_bar(stat="count")+
scale_color_manual()+
scale_fill_manual(
values = c("dodgerblue4","firebrick3","darkgreen","dark turquoise"))
Provide differential genes between given groups.
Differential_genes require following inputs:
Diff_matrix = unCTC::Differential_genes(data_list=dataList,
min_Sample = 5,
min_Gene = 1500,
DDLK_Clusters,
Genesets = genesets,
data_id=dataId,
data_type = "Normalised",
DifferentiateBy = "Clusters",
up_gene_number = 200)
library(pheatmap)
library(viridis)
annotation = Diff_matrix$annotations
annotation$Data_id <- NULL
annotation$GroupID <- NULL
annotation$Cell_type <- NULL
ann = annotation[,c("HormoneStatus","Class","Clusters")]
pheatmap(t(scale(t(Diff_matrix$DiffMat))),cluster_cols = FALSE,
show_colnames = FALSE,cluster_rows = FALSE, show_rownames = FALSE,
color = viridis(1000),annotation = ann)
Provide differential pathways between given groups.
Differential_pathways require following inputs:
Diff_path = unCTC::Differential_pathways(Pathway_score,
DDLK_Clusters = DDLK_Clusters,
DifferentiateBy = "Clusters",
up_pathways_number = 100
)
annotation = Diff_path$annotations
annotation$Data_id <- NULL
annotation$GroupID <- NULL
annotation$Cell_type <- NULL
ann = annotation[,c("HormoneStatus","Class","Clusters")]
Specific_pathways = c("BIOCARTA_THELPER_PATHWAY","BIOCARTA_TCYTOTOXIC_PATHWAY","BIOCARTA_CTL_PATHWAY","BIOCARTA_IL17_PATHWAY","BIOCARTA_CTLA4_PATHWAY","LEE_DIFFERENTIATING_T_LYMPHOCYTE","BIOCARTA_MONOCYTE_PATHWAY",
"ZHENG_FOXP3_TARGETS_IN_T_LYMPHOCYTE_DN","SPIELMAN_LYMPHOBLAST_EUROPEAN_VS_ASIAN_2FC_DN","GUTIERREZ_CHRONIC_LYMPHOCYTIC_LEUKEMIA_UP","GOERING_BLOOD_HDL_CHOLESTEROL_QTL_CIS",
"FARMER_BREAST_CANCER_CLUSTER_6","TURASHVILI_BREAST_NORMAL_DUCTAL_VS_LOBULAR_UP","YANG_BREAST_CANCER_ESR1_BULK_UP","NIKOLSKY_BREAST_CANCER_19Q13.1_AMPLICON",
"GINESTIER_BREAST_CANCER_ZNF217_AMPLIFIED_UP","HOLLERN_SOLID_NODULAR_BREAST_TUMOR_UP","FINETTI_BREAST_CANCER_KINOME_RED","NIKOLSKY_BREAST_CANCER_7Q21_Q22_AMPLICON",
"YANG_BREAST_CANCER_ESR1_UP","GINESTIER_BREAST_CANCER_ZNF217_AMPLIFIED_UP","WP_MAMMARY_GLAND_DEVELOPMENT_PATHWAY_INVOLUTION_STAGE_4_OF_4","MACLACHLAN_BRCA1_TARGETS_UP",
"REACTOME_ERBB2_ACTIVATES_PTK6_SIGNALING","REACTOME_PI3K_EVENTS_IN_ERBB2_SIGNALING","REACTOME_PI3K_EVENTS_IN_ERBB4_SIGNALING","REACTOME_SHC1_EVENTS_IN_ERBB2_SIGNALING","REACTOME_GRB2_EVENTS_IN_ERBB2_SIGNALING")
mat = Diff_path$DiffMatpathway[Specific_pathways,]
pheatmap(t(scale(t(mat))),cluster_cols = FALSE,
show_colnames = FALSE,cluster_rows = FALSE, show_rownames = TRUE, fontsize_row =7,fontsize = 7,
color = viridis(1000),annotation = ann)
Stouffer_score method uses the following steps:
The followings input are required to calculate Stouffer’s score:
With this package, we have given two types of gene list:
load("~/Supplemental_code/unCTC_datasets/Breast_elevated_genes.RData")
load("~/Supplemental_code/unCTC_datasets/Blood_specific_gene.RData")
#Calculate Stouffer's score for Blood gene
S_WBC = Stouffer_score(data_list = dataList,
min_Sample = 5,
min_Gene = 1500,
gene_list =Blood_specific_gene,
data_id = dataId,
Groupby = "Clusters",
DDLKCluster_data = DDLK_Clusters)
#Calculate Stouffer's score for Breast elevated genes
S_Breast = Stouffer_score(data_list = dataList,
min_Sample = 5,
min_Gene = 1500,
gene_list = Breast_elevated_genes,
data_id = dataId,
Groupby = "Clusters",
DDLKCluster_data = DDLK_Clusters)
For better colour visualization we are using following color key:
library(ggplot2)
library(ggpubr)
ColorKey = c("darkred","deepskyblue3","darkolivegreen4",
"dark turquoise","pale violet red",
"steelblue","forestgreen","gray2",
"gray50","hotpink","lightslateblue",
"tan4","yellow3","sienna4","orchid4")
ggplot(S_WBC$Stouffer_score,aes(x=Clusters,y= Stouffer_score,fill=Clusters))+
geom_boxplot(outlier.shape = NA) + theme_classic() +
scale_fill_manual(values = ColorKey) +
ggtitle("Immune gene signature")+
stat_compare_means(comparisons = S_WBC$comparisons,
label = "p.signif", method = "t.test",ref.group = ".all.")
ggplot(S_Breast$Stouffer_score,aes(x=Clusters,y= Stouffer_score,fill=Clusters))+
geom_boxplot(outlier.shape = NA) + theme_classic() +
scale_fill_manual(values = ColorKey)+
ggtitle("Breast elevated gene signature")+
stat_compare_means(comparisons = S_Breast$comparisons,
label = "p.signif", method = "t.test",ref.group = ".all.")
inferCNV R package is used for analysing copy number variation for raw Count/TPM data. Along with all analysis of inferCNV, unCTC::CNV_alterations calculate addition and deletion position in p and q arms in test/cancerous/ diseased data as compared to reference/normal/healthy. To calculate p and q arm location from inferCNV events, we used GRCh37 cytoband information.
CNV_alterations require the following inputs:
gencode_v19_gene_pos =unCTC::gencode_v19_gene_pos
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._PBMC_CountData.RData")
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._PBMC_metaData.RData")
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._CountData.RData")
load("~/Supplemental_code/unCTC_datasets/Poonia_et_al._metaData.RData")
ref_data =Poonia_et_al._PBMC_CountData
ref_metadata = Poonia_et_al._PBMC_metaData
obs_data = Poonia_et_al._CountData
obs_metadata = Poonia_et_al._metaData
dataList1 = list(ref_data,obs_data)
dataId1 = list("WBC","CTC")
MetaData1= list(ref_metadata,obs_metadata)
CNV_Alterations1 = CNV_alterations(
data_list= dataList1,
data_id= dataId1,
min_Sample = 5,
min_Gene = 1500,
path= getwd(),
GenePositionFile= gencode_v19_gene_pos,
threads_no=8,
MetaData = MetaData1,
Groupby = "GroupID",
Reference_name = c("WBC"),
obs.title ="Observations",
ref.title = "References",
cluster_by_groups = TRUE,
out.Filename = "inferCNV"
)
load("/home/saritap/unCTC_datasets/GSE181279_Countdata.RData")
ref_data = GSE181279_Countdata
load("/home/saritap/unCTC_datasets/Ebright_et_al._CountData.RData")
obs_data = Ebright_et_al._CountData
dataList = list(ref_data,obs_data)
dataId = list("WBC","CTC")
CNV_Alterations2 = CNV_alterations(
data_list= dataList,
data_id= dataId,
min_Sample = 5,
min_Gene = 1500,
path= getwd(),
GenePositionFile= gencode_v19_gene_pos,
threads_no=8,
Groupby = "Data_id",
Reference_name = c("WBC"), # WBC data as reference
obs.title ="Observations",
ref.title = "References",
out.Filename = "inferCNV1"
)
Give violin plot for a given Canonical marker expression.
Gene_Violin_plots require input:
* data_list: List of expression data matrices
* data_id: List of expression data matrices name in the same
order.
* min_Sample: filter out genes which are not expressedin at least
min_Sample cells, Default is 5.
* min_Gene: Filter out those cells which do not express at least
min_Gene genes, Default is 1500. * gene_symbol: Specific gene for which
we want to see expression.
* MetaData: Optional, list of metadata of expression matrices. If given
then columns of all metadata in the list must be identical.
* Groupby: Any column name from MetaData, which we want to use to see
differential expression of the gene. Default is “data_id”.
# Gene Violin plot
PTPRC = Gene_Violin_plots(data_list =dataList,
data_id = dataId,
min_Sample = 5,
min_Gene = 1500,
gene_symbol = "PTPRC",
DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
Groupby = "Clusters")
NKG7 = Gene_Violin_plots(data_list =dataList,
data_id = dataId,
min_Sample = 5,
min_Gene = 1500,
gene_symbol = "NKG7",
DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
Groupby = "Clusters")
EPCAM = Gene_Violin_plots(data_list =dataList,
data_id = dataId,
min_Sample = 5,
min_Gene = 1500,
gene_symbol = "EPCAM",
DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
Groupby = "Clusters")
KRT18 = Gene_Violin_plots(data_list =dataList,
data_id = dataId,
min_Sample = 5,
min_Gene = 1500,
gene_symbol = "KRT18",
DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
Groupby = "Clusters")
library(cowplot)
PTPRC$Violin_plot
NKG7$Violin_plot
EPCAM$Violin_plot
KRT18$Violin_plot
aricode::ARI(DDLK_Clusters$PathwayDDLK_clust$Clusters,DDLK_Clusters$PathwayDDLK_clust$Cell_type)
## [1] 0.570144
aricode::NMI(DDLK_Clusters$PathwayDDLK_clust$Clusters,DDLK_Clusters$PathwayDDLK_clust$Cell_type)
## [1] 0.47599
ClusterPurity <- function(clusters, classes) {
sum(apply(table(classes, clusters), 2, max)) / length(clusters)
}
ClusterPurity(DDLK_Clusters$PathwayDDLK_clust$Clusters,DDLK_Clusters$PathwayDDLK_clust$Cell_type)
## [1] 0.982823