##
## The following notebook was used to plot the distribution of cell types/states and sparsity across the datasets used.
## 
##

import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Directory holding the preprocessed .h5ad inputs. File names are appended to
# this string with `+`, so it must end with a path separator.
path = 'ANS_supplementary_information/reproduce_project/data/preprocessed/'
# NOTE(review): `<output_path>` is a placeholder, not valid Python -- replace it
# with the output directory as a string before running. Figure file names are
# appended with `+` (outputPath + "..."), so include a trailing separator.
outputPath = <output_path>

# BRCA: proportion of cells and matrix sparsity per gene module.
breast = sc.read_h5ad(path + 'pp_breast_malignant.h5ad')

# Grouping column, fetched once and reused for the counts and the masks below.
group_labels = breast.obs['gene_module']

# Fraction of all cells that falls into each gene module.
cell_counts = group_labels.value_counts()
cell_props = cell_counts / breast.n_obs

cell_props_df = pd.DataFrame({
    "Gene module": cell_props.index,
    "Proportion": cell_props.values
})

print(cell_props_df)

# Sparsity = 1 - stored entries / total entries. This relies on `.X.nnz`, so it
# assumes `.X` is a SciPy sparse matrix -- TODO confirm for this input file.
# The entry count is accumulated in float64: with a 32-bit default integer
# (e.g. NumPy < 2 on Windows) the integer product of the shape can wrap around
# silently for large matrices.
sparsity_overall = 1 - (breast.X.nnz / np.prod(breast.shape, dtype=np.float64))
print(f"Overall sparsity: {sparsity_overall:.2%}")

# Same sparsity measure, restricted to the cells of each gene module.
sparsity_list = []
for ct in group_labels.unique():
    subset = breast[group_labels == ct]
    sparsity = 1 - (subset.X.nnz / np.prod(subset.shape, dtype=np.float64))
    sparsity_list.append({"Gene module": ct, "Sparsity": sparsity})

sparsity_df = pd.DataFrame(sparsity_list)

# Long format (one row per group and metric) so seaborn can draw the two
# metrics as side-by-side bars via `hue`.
df_long = pd.concat([
    cell_props_df.rename(columns={"Proportion": "Value"}).assign(Metric="Proportion"),
    sparsity_df.rename(columns={"Sparsity": "Value"}).assign(Metric="Sparsity")
], ignore_index=True)

plt.figure(figsize=(8,5))
sns.barplot(
    data=df_long,
    x="Gene module",
    y="Value",
    hue="Metric",
    palette=["skyblue", "lightcoral"]
)

plt.ylabel("Value", size=16)
plt.xlabel("Gene module", size=16)
plt.title("Cell type composition and sparsity of BRCA dataset", size=16)
plt.xticks(rotation=45, ha='right', size=16)
plt.yticks(size=16)
plt.legend(title="Metric", title_fontsize=16, fontsize=16)
plt.tight_layout()
plt.savefig(outputPath + "BRCA_dataset.svg", format='svg', bbox_inches='tight')
plt.show()


# LUAD: proportion of cells and matrix sparsity per cell subtype.
luad = sc.read_h5ad(path + 'pp_luad_kim_malignant.h5ad')

# Grouping column, fetched once and reused for the counts and the masks below.
group_labels = luad.obs['Cell_subtype']

# Fraction of all cells that falls into each cell subtype.
cell_counts = group_labels.value_counts()
cell_props = cell_counts / luad.n_obs

cell_props_df = pd.DataFrame({
    "Cell subtype": cell_props.index,
    "Proportion": cell_props.values
})

print(cell_props_df)

# Sparsity = 1 - stored entries / total entries. This relies on `.X.nnz`, so it
# assumes `.X` is a SciPy sparse matrix -- TODO confirm for this input file.
# The entry count is accumulated in float64: with a 32-bit default integer
# (e.g. NumPy < 2 on Windows) the integer product of the shape can wrap around
# silently for large matrices.
sparsity_overall = 1 - (luad.X.nnz / np.prod(luad.shape, dtype=np.float64))
print(f"Overall sparsity: {sparsity_overall:.2%}")

# Same sparsity measure, restricted to the cells of each subtype.
sparsity_list = []
for ct in group_labels.unique():
    subset = luad[group_labels == ct]
    sparsity = 1 - (subset.X.nnz / np.prod(subset.shape, dtype=np.float64))
    sparsity_list.append({"Cell subtype": ct, "Sparsity": sparsity})

sparsity_df = pd.DataFrame(sparsity_list)

# Long format (one row per group and metric) so seaborn can draw the two
# metrics as side-by-side bars via `hue`.
df_long = pd.concat([
    cell_props_df.rename(columns={"Proportion": "Value"}).assign(Metric="Proportion"),
    sparsity_df.rename(columns={"Sparsity": "Value"}).assign(Metric="Sparsity")
], ignore_index=True)

plt.figure(figsize=(8,5))
sns.barplot(
    data=df_long,
    x="Cell subtype",
    y="Value",
    hue="Metric",
    palette=["skyblue", "lightcoral"]
)

plt.ylabel("Value", size=16)
plt.xlabel("Cell subtype", size=16)
plt.title("Cell type composition and sparsity of LUAD Kim dataset", size=16)
plt.xticks(rotation=45, ha='right', size=16)
plt.yticks(size=16)
plt.legend(title="Metric", title_fontsize=16, fontsize=16)
plt.tight_layout()
plt.savefig(outputPath + "LUAD_dataset.svg", format='svg', bbox_inches='tight')
plt.show()

# Ovarian (HGSOC): proportion of cells and matrix sparsity per cluster label.
ov = sc.read_h5ad(path + 'pp_ovarian_malignant.h5ad')

# Grouping column, fetched once and reused for the counts and the masks below.
group_labels = ov.obs['cluster_label']

# Fraction of all cells that falls into each cluster.
cell_counts = group_labels.value_counts()
cell_props = cell_counts / ov.n_obs

cell_props_df = pd.DataFrame({
    "Cluster label": cell_props.index,
    "Proportion": cell_props.values
})

print(cell_props_df)

# Sparsity = 1 - stored entries / total entries. This relies on `.X.nnz`, so it
# assumes `.X` is a SciPy sparse matrix -- TODO confirm for this input file.
# The entry count is accumulated in float64: with a 32-bit default integer
# (e.g. NumPy < 2 on Windows) the integer product of the shape can wrap around
# silently for large matrices.
sparsity_overall = 1 - (ov.X.nnz / np.prod(ov.shape, dtype=np.float64))
print(f"Overall sparsity: {sparsity_overall:.2%}")

# Same sparsity measure, restricted to the cells of each cluster.
sparsity_list = []
for ct in group_labels.unique():
    subset = ov[group_labels == ct]
    sparsity = 1 - (subset.X.nnz / np.prod(subset.shape, dtype=np.float64))
    sparsity_list.append({"Cluster label": ct, "Sparsity": sparsity})

sparsity_df = pd.DataFrame(sparsity_list)

# Long format (one row per group and metric) so seaborn can draw the two
# metrics as side-by-side bars via `hue`.
df_long = pd.concat([
    cell_props_df.rename(columns={"Proportion": "Value"}).assign(Metric="Proportion"),
    sparsity_df.rename(columns={"Sparsity": "Value"}).assign(Metric="Sparsity")
], ignore_index=True)

plt.figure(figsize=(8,5))
sns.barplot(
    data=df_long,
    x="Cluster label",
    y="Value",
    hue="Metric",
    palette=["skyblue", "lightcoral"]
)

plt.ylabel("Value", size=16)
plt.xlabel("Cluster label", size=16)
plt.title("Cell type composition and sparsity of HGSOC dataset", size=16)
plt.xticks(rotation=45, ha='right', size=16)
plt.yticks(size=16)
plt.legend(title="Metric", title_fontsize=16, fontsize=16)
plt.tight_layout()
plt.savefig(outputPath + "HGSOC_dataset.svg", format='svg', bbox_inches='tight')
plt.show()


# Skin (sCC) dataset: proportion of cells and matrix sparsity per level-2 cell type.
skin = sc.read_h5ad(path + 'pp_skin_malignant.h5ad')

# Grouping column, fetched once and reused for the counts and the masks below.
group_labels = skin.obs['level2_celltype']

# Fraction of all cells that falls into each group.
cell_counts = group_labels.value_counts()
cell_props = cell_counts / skin.n_obs

# NOTE(review): the column/axis label is "Cluster label" although the grouping
# column is 'level2_celltype' -- confirm this label is intended.
cell_props_df = pd.DataFrame({
    "Cluster label": cell_props.index,
    "Proportion": cell_props.values
})

print(cell_props_df)

# Sparsity = 1 - stored entries / total entries. This relies on `.X.nnz`, so it
# assumes `.X` is a SciPy sparse matrix -- TODO confirm for this input file.
# The entry count is accumulated in float64: with a 32-bit default integer
# (e.g. NumPy < 2 on Windows) the integer product of the shape can wrap around
# silently for large matrices.
sparsity_overall = 1 - (skin.X.nnz / np.prod(skin.shape, dtype=np.float64))
print(f"Overall sparsity: {sparsity_overall:.2%}")

# Same sparsity measure, restricted to the cells of each group.
sparsity_list = []
for ct in group_labels.unique():
    subset = skin[group_labels == ct]
    sparsity = 1 - (subset.X.nnz / np.prod(subset.shape, dtype=np.float64))
    sparsity_list.append({"Cluster label": ct, "Sparsity": sparsity})

sparsity_df = pd.DataFrame(sparsity_list)

# Long format (one row per group and metric) so seaborn can draw the two
# metrics as side-by-side bars via `hue`.
df_long = pd.concat([
    cell_props_df.rename(columns={"Proportion": "Value"}).assign(Metric="Proportion"),
    sparsity_df.rename(columns={"Sparsity": "Value"}).assign(Metric="Sparsity")
], ignore_index=True)

plt.figure(figsize=(8,5))
sns.barplot(
    data=df_long,
    x="Cluster label",
    y="Value",
    hue="Metric",
    palette=["skyblue", "lightcoral"]
)

plt.ylabel("Value", size=16)
plt.xlabel("Cluster label", size=16)
plt.title("Cell type composition and sparsity of sCC dataset", size=16)
plt.xticks(rotation=45, ha='right', size=16)
plt.yticks(size=16)
plt.legend(title="Metric", title_fontsize=16, fontsize=16)
plt.tight_layout()
plt.savefig(outputPath + "sCC_dataset.svg", format='svg', bbox_inches='tight')
plt.show()


# B, NK, Mono (PBMC): proportion of cells and matrix sparsity per level-1 cell type.
bmn = sc.read_h5ad(path + 'pp_pbmc_b_mono_nk.h5ad')

# Grouping column, fetched once and reused for the counts and the masks below.
group_labels = bmn.obs['celltype.l1']

# Fraction of all cells that falls into each cell type.
cell_counts = group_labels.value_counts()
cell_props = cell_counts / bmn.n_obs

cell_props_df = pd.DataFrame({
    "Cell type": cell_props.index,
    "Proportion": cell_props.values
})

print(cell_props_df)

# Sparsity = 1 - stored entries / total entries. This relies on `.X.nnz`, so it
# assumes `.X` is a SciPy sparse matrix -- TODO confirm for this input file.
# The entry count is accumulated in float64: with a 32-bit default integer
# (e.g. NumPy < 2 on Windows) the integer product of the shape can wrap around
# silently for large matrices.
sparsity_overall = 1 - (bmn.X.nnz / np.prod(bmn.shape, dtype=np.float64))
print(f"Overall sparsity: {sparsity_overall:.2%}")

# Same sparsity measure, restricted to the cells of each cell type.
sparsity_list = []
for ct in group_labels.unique():
    subset = bmn[group_labels == ct]
    sparsity = 1 - (subset.X.nnz / np.prod(subset.shape, dtype=np.float64))
    sparsity_list.append({"Cell type": ct, "Sparsity": sparsity})

sparsity_df = pd.DataFrame(sparsity_list)

# Long format (one row per group and metric) so seaborn can draw the two
# metrics as side-by-side bars via `hue`.
df_long = pd.concat([
    cell_props_df.rename(columns={"Proportion": "Value"}).assign(Metric="Proportion"),
    sparsity_df.rename(columns={"Sparsity": "Value"}).assign(Metric="Sparsity")
], ignore_index=True)

plt.figure(figsize=(8,5))
sns.barplot(
    data=df_long,
    x="Cell type",
    y="Value",
    hue="Metric",
    palette=["skyblue", "lightcoral"]
)

plt.ylabel("Value", size=16)
plt.xlabel("Cell type", size=16)
plt.title("Cell type composition and sparsity of B, NK and Monocyte dataset", size=16)
plt.xticks(rotation=45, ha='right', size=16)
plt.yticks(size=16)
plt.legend(title="Metric", title_fontsize=16, fontsize=16)
plt.tight_layout()
plt.savefig(outputPath + "BNKM_dataset.svg", format='svg', bbox_inches='tight')
plt.show()

# B subtypes (3): proportion of cells and matrix sparsity per level-2 cell type.
# `b3` stays bound at module level; the 6-subtype section reuses it.
b3 = sc.read_h5ad(path + 'pp_pbmc_b_subtypes.h5ad')

# Grouping column, fetched once and reused for the counts and the masks below.
group_labels = b3.obs['celltype.l2']

# Fraction of all cells that falls into each subtype.
cell_counts = group_labels.value_counts()
cell_props = cell_counts / b3.n_obs

cell_props_df = pd.DataFrame({
    "Cell type": cell_props.index,
    "Proportion": cell_props.values
})

print(cell_props_df)

# Sparsity = 1 - stored entries / total entries. This relies on `.X.nnz`, so it
# assumes `.X` is a SciPy sparse matrix -- TODO confirm for this input file.
# The entry count is accumulated in float64: with a 32-bit default integer
# (e.g. NumPy < 2 on Windows) the integer product of the shape can wrap around
# silently for large matrices.
sparsity_overall = 1 - (b3.X.nnz / np.prod(b3.shape, dtype=np.float64))
print(f"Overall sparsity: {sparsity_overall:.2%}")

# Same sparsity measure, restricted to the cells of each subtype.
sparsity_list = []
for ct in group_labels.unique():
    subset = b3[group_labels == ct]
    sparsity = 1 - (subset.X.nnz / np.prod(subset.shape, dtype=np.float64))
    sparsity_list.append({"Cell type": ct, "Sparsity": sparsity})

sparsity_df = pd.DataFrame(sparsity_list)

# Long format (one row per group and metric) so seaborn can draw the two
# metrics as side-by-side bars via `hue`.
df_long = pd.concat([
    cell_props_df.rename(columns={"Proportion": "Value"}).assign(Metric="Proportion"),
    sparsity_df.rename(columns={"Sparsity": "Value"}).assign(Metric="Sparsity")
], ignore_index=True)

plt.figure(figsize=(8,5))
sns.barplot(
    data=df_long,
    x="Cell type",
    y="Value",
    hue="Metric",
    palette=["skyblue", "lightcoral"]
)

plt.ylabel("Value", size=16)
plt.xlabel("Cell type", size=16)
plt.title("Cell type composition and sparsity of B cell dataset (3 subtypes)", size=16)
plt.xticks(rotation=45, ha='right', size=16)
plt.yticks(size=16)
plt.legend(title="Metric", title_fontsize=16, fontsize=16)
plt.tight_layout()
plt.savefig(outputPath + "B3_dataset.svg", format='svg', bbox_inches='tight')
plt.show()

# B 6 subtypes: same AnnData object `b3` as the 3-subtype figure (loaded from
# 'pp_pbmc_b_subtypes.h5ad'), grouped by the finer 'celltype.l3' annotation.

# Grouping column, fetched once and reused for the counts and the masks below.
group_labels = b3.obs['celltype.l3']

# Fraction of all cells that falls into each subtype.
cell_counts = group_labels.value_counts()
cell_props = cell_counts / b3.n_obs

cell_props_df = pd.DataFrame({
    "Cell type": cell_props.index,
    "Proportion": cell_props.values
})

print(cell_props_df)

# Sparsity = 1 - stored entries / total entries. This relies on `.X.nnz`, so it
# assumes `.X` is a SciPy sparse matrix -- TODO confirm for this input file.
# The entry count is accumulated in float64: with a 32-bit default integer
# (e.g. NumPy < 2 on Windows) the integer product of the shape can wrap around
# silently for large matrices.
sparsity_overall = 1 - (b3.X.nnz / np.prod(b3.shape, dtype=np.float64))
print(f"Overall sparsity: {sparsity_overall:.2%}")

# Same sparsity measure, restricted to the cells of each subtype.
sparsity_list = []
for ct in group_labels.unique():
    subset = b3[group_labels == ct]
    sparsity = 1 - (subset.X.nnz / np.prod(subset.shape, dtype=np.float64))
    sparsity_list.append({"Cell type": ct, "Sparsity": sparsity})

sparsity_df = pd.DataFrame(sparsity_list)

# Long format (one row per group and metric) so seaborn can draw the two
# metrics as side-by-side bars via `hue`.
df_long = pd.concat([
    cell_props_df.rename(columns={"Proportion": "Value"}).assign(Metric="Proportion"),
    sparsity_df.rename(columns={"Sparsity": "Value"}).assign(Metric="Sparsity")
], ignore_index=True)

# Plot side-by-side bars
plt.figure(figsize=(8,5))
sns.barplot(
    data=df_long,
    x="Cell type",
    y="Value",
    hue="Metric",
    palette=["skyblue", "lightcoral"]
)

plt.ylabel("Value", size=16)
plt.xlabel("Cell type", size=16)
plt.title("Cell type composition and sparsity of B cell dataset (6 subtypes)", size=16)
plt.xticks(rotation=45, ha='right', size=16)
plt.yticks(size=16)
plt.legend(title="Metric", title_fontsize=16, fontsize=16)
plt.tight_layout()
plt.savefig(outputPath + "B6_dataset.svg", format='svg', bbox_inches='tight')
plt.show()

# CD4: proportion of cells and matrix sparsity per level-2 cell type.
cd4 = sc.read_h5ad(path + 'pp_pbmc_cd4_subtypes.h5ad')

# Grouping column, fetched once and reused for the counts and the masks below.
group_labels = cd4.obs['celltype.l2']

# Fraction of all cells that falls into each subtype.
cell_counts = group_labels.value_counts()
cell_props = cell_counts / cd4.n_obs

cell_props_df = pd.DataFrame({
    "Cell type": cell_props.index,
    "Proportion": cell_props.values
})

print(cell_props_df)

# Sparsity = 1 - stored entries / total entries. This relies on `.X.nnz`, so it
# assumes `.X` is a SciPy sparse matrix -- TODO confirm for this input file.
# The entry count is accumulated in float64: with a 32-bit default integer
# (e.g. NumPy < 2 on Windows) the integer product of the shape can wrap around
# silently for large matrices.
sparsity_overall = 1 - (cd4.X.nnz / np.prod(cd4.shape, dtype=np.float64))
print(f"Overall sparsity: {sparsity_overall:.2%}")

# Same sparsity measure, restricted to the cells of each subtype.
sparsity_list = []
for ct in group_labels.unique():
    subset = cd4[group_labels == ct]
    sparsity = 1 - (subset.X.nnz / np.prod(subset.shape, dtype=np.float64))
    sparsity_list.append({"Cell type": ct, "Sparsity": sparsity})

sparsity_df = pd.DataFrame(sparsity_list)

# Long format (one row per group and metric) so seaborn can draw the two
# metrics as side-by-side bars via `hue`.
df_long = pd.concat([
    cell_props_df.rename(columns={"Proportion": "Value"}).assign(Metric="Proportion"),
    sparsity_df.rename(columns={"Sparsity": "Value"}).assign(Metric="Sparsity")
], ignore_index=True)

# Plot side-by-side bars
plt.figure(figsize=(8,5))
sns.barplot(
    data=df_long,
    x="Cell type",
    y="Value",
    hue="Metric",
    palette=["skyblue", "lightcoral"]
)

plt.ylabel("Value", size=16)
plt.xlabel("Cell type", size=16)
plt.title("Cell type composition and sparsity of CD4 cell dataset (6 subtypes)", size=16)
plt.xticks(rotation=45, ha='right', size=16)
plt.yticks(size=16)
plt.legend(title="Metric", title_fontsize=16, fontsize=16)
plt.tight_layout()
plt.savefig(outputPath + "CD4_dataset.svg", format='svg', bbox_inches='tight')
plt.show()

# CD8: proportion of cells and matrix sparsity per level-2 cell type.
cd8 = sc.read_h5ad(path + 'pp_pbmc_cd8_subtypes.h5ad')

# Grouping column, fetched once and reused for the counts and the masks below.
group_labels = cd8.obs['celltype.l2']

# Fraction of all cells that falls into each subtype.
cell_counts = group_labels.value_counts()
cell_props = cell_counts / cd8.n_obs

cell_props_df = pd.DataFrame({
    "Cell type": cell_props.index,
    "Proportion": cell_props.values
})

print(cell_props_df)

# Sparsity = 1 - stored entries / total entries. This relies on `.X.nnz`, so it
# assumes `.X` is a SciPy sparse matrix -- TODO confirm for this input file.
# The entry count is accumulated in float64: with a 32-bit default integer
# (e.g. NumPy < 2 on Windows) the integer product of the shape can wrap around
# silently for large matrices.
sparsity_overall = 1 - (cd8.X.nnz / np.prod(cd8.shape, dtype=np.float64))
print(f"Overall sparsity: {sparsity_overall:.2%}")

# Same sparsity measure, restricted to the cells of each subtype.
sparsity_list = []
for ct in group_labels.unique():
    subset = cd8[group_labels == ct]
    sparsity = 1 - (subset.X.nnz / np.prod(subset.shape, dtype=np.float64))
    sparsity_list.append({"Cell type": ct, "Sparsity": sparsity})

sparsity_df = pd.DataFrame(sparsity_list)

# Long format (one row per group and metric) so seaborn can draw the two
# metrics as side-by-side bars via `hue`.
df_long = pd.concat([
    cell_props_df.rename(columns={"Proportion": "Value"}).assign(Metric="Proportion"),
    sparsity_df.rename(columns={"Sparsity": "Value"}).assign(Metric="Sparsity")
], ignore_index=True)

plt.figure(figsize=(8,5))
sns.barplot(
    data=df_long,
    x="Cell type",
    y="Value",
    hue="Metric",
    palette=["skyblue", "lightcoral"]
)

plt.ylabel("Value", size=16)
plt.xlabel("Cell type", size=16)
plt.title("Cell type composition and sparsity of CD8 cell dataset (4 subtypes)", size=16)
plt.xticks(rotation=45, ha='right', size=16)
plt.yticks(size=16)
plt.legend(title="Metric", title_fontsize=16, fontsize=16)
plt.tight_layout()
plt.savefig(outputPath + "CD8_dataset.svg", format='svg', bbox_inches='tight')
plt.show()

#Neuronal differentiation dataset, produced using "neuronal_differentation_dataset.py" notebook
# Unlike the other inputs, this file is read relative to the working directory
# rather than from `path`.
data = sc.read_h5ad('neuronal_scored.h5ad')

# Grouping column, fetched once and reused for the counts and the masks below.
group_labels = data.obs['celltype']

# Fraction of all cells that falls into each cell type.
cell_counts = group_labels.value_counts()
cell_props = cell_counts / data.n_obs

cell_props_df = pd.DataFrame({
    "Cell type": cell_props.index,
    "Proportion": cell_props.values
})

print(cell_props_df)

# Sparsity = 1 - stored entries / total entries. This relies on `.X.nnz`, so it
# assumes `.X` is a SciPy sparse matrix -- TODO confirm for this input file.
# The entry count is accumulated in float64: with a 32-bit default integer
# (e.g. NumPy < 2 on Windows) the integer product of the shape can wrap around
# silently for large matrices.
sparsity_overall = 1 - (data.X.nnz / np.prod(data.shape, dtype=np.float64))
print(f"Overall sparsity: {sparsity_overall:.2%}")

# Same sparsity measure, restricted to the cells of each cell type.
sparsity_list = []
for ct in group_labels.unique():
    subset = data[group_labels == ct]
    sparsity = 1 - (subset.X.nnz / np.prod(subset.shape, dtype=np.float64))
    sparsity_list.append({"Cell type": ct, "Sparsity": sparsity})

sparsity_df = pd.DataFrame(sparsity_list)

# Long format (one row per group and metric) so seaborn can draw the two
# metrics as side-by-side bars via `hue`.
df_long = pd.concat([
    cell_props_df.rename(columns={"Proportion": "Value"}).assign(Metric="Proportion"),
    sparsity_df.rename(columns={"Sparsity": "Value"}).assign(Metric="Sparsity")
], ignore_index=True)

plt.figure(figsize=(8,5))
sns.barplot(
    data=df_long,
    x="Cell type",
    y="Value",
    hue="Metric",
    palette=["skyblue", "lightcoral"]
)

plt.ylabel("Value", size=16)
plt.xlabel("Cell type", size=16)
plt.title("Cell type composition and sparsity of Neuronal differentiation\ndataset (12 subtypes)", size=16)
plt.xticks(rotation=45, ha='right', size=16)
plt.ylim(0, 1.0)  # Set y-axis limit from 0 to 1
# Set y-ticks at 0.1 intervals; this single call also applies the label size,
# so no separate plt.yticks(size=16) call is needed beforehand.
plt.yticks(np.arange(0, 1.1, 0.1), size=16)
plt.legend(title="Metric", title_fontsize=16, fontsize=16)
plt.tight_layout()
plt.savefig(outputPath + "DA_dataset.svg", format='svg', bbox_inches='tight')
plt.show()
