library(Seurat)
library(dplyr)
library(tidyverse)
library(zellkonverter)

#tpm_combined<-read.table("/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/breast_cancer_TNBC/total_sample/tpm_combined.mat")
# Load the merged expression matrix (genes in rows, one column per cell).
# NOTE(review): the variable and the value column below are called "TPM", but
# the file read here is count_merged.mat -- confirm whether these are counts.
tpm_combined <- read.table("/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/esophageal_cancer/total_sample/count_merged.mat")
stopifnot(is.data.frame(tpm_combined))

# Number of cells (columns) that were read.
ncol(tpm_combined)

# Per-cell annotation table; the code below reads its `cell_id` and `tissue`
# columns.
met_df <- read_tsv("/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/esophageal_cancer/inferCNVpy/metadata.txt")

# Long format: one row per (gene_id, cell_id) pair with the value in `TPM`.
# All columns are pivoted, so nothing depends on a hard-coded number of cells.
t_tpm_combined <- tpm_combined %>%
  rownames_to_column("gene_id") %>%
  pivot_longer(cols = -gene_id, names_to = "cell_id", values_to = "TPM")


# Write the complete long-format table (gene_id, cell_id, TPM) as TSV.
write.table(
  t_tpm_combined,
  "/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/esophageal_cancer/inferCNVpy/t_count_combined.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# Write only the APOBEC3A / APOBEC3B rows, highest value first.
write.table(
  arrange(
    filter(t_tpm_combined, gene_id %in% c("APOBEC3A", "APOBEC3B")),
    desc(TPM)
  ),
  "/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/esophageal_cancer/inferCNVpy/APOBEC.eso.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)


# Build expression matrix: wide format, gene x cell
expression_matrix <- t_tpm_combined %>%
  pivot_wider(names_from = cell_id, values_from = TPM) %>%
  column_to_rownames("gene_id") %>%
  as.data.frame()

# Build per-cell metadata (sample_id, batch) in one vectorised step.
# A previous per-cell loop recomputed columns that pivot_wider() had already
# produced and rbind()-ed every cell onto this table a second time, which
# doubled its rows; it has been removed.
cell_ids <- colnames(expression_matrix)
missing_cells <- setdiff(cell_ids, met_df$cell_id)
if (length(missing_cells) > 0) {
  stop(
    "No metadata row for ", length(missing_cells), " cell(s), e.g. ",
    paste(head(missing_cells, 5), collapse = ", "),
    call. = FALSE
  )
}

metadata <- met_df %>%
  filter(cell_id %in% cell_ids) %>%
  select(cell_id, tissue) %>%
  rename(sample_id = cell_id, batch = tissue)

# Row i of `metadata` must describe column i of `expression_matrix`;
# match() also keeps only the first row if a cell_id is listed more than once.
metadata <- metadata[match(cell_ids, metadata$sample_id), ]

# Quick sanity check: genes x cells.
dim(expression_matrix)
# Convert to a Seurat object
library(Seurat)
seurat_obj <- CreateSeuratObject(counts = expression_matrix, assay = "RNA")

# Add metadata to Seurat object.
# Look every cell up by name instead of relying on `metadata` being in the
# same order (and of the same length) as the cells of the object.
meta_idx <- match(colnames(seurat_obj), metadata$sample_id)
if (anyNA(meta_idx)) {
  stop("Some cells of seurat_obj have no row in `metadata`", call. = FALSE)
}
seurat_obj@meta.data$batch <- metadata$batch[meta_idx]
seurat_obj@meta.data$sample_id <- metadata$sample_id[meta_idx]


seurat_obj <- NormalizeData(
  seurat_obj,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)
#seurat_obj <- NormalizeData(seurat_obj)

# Union of the top 2000 "vst" features computed separately within each batch.
VariableFeatures(seurat_obj) <- split(row.names(seurat_obj@meta.data), seurat_obj@meta.data$batch) %>%
  lapply(function(cells_use) {
    seurat_obj[, cells_use] %>%
      FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>%
      VariableFeatures()
  }) %>%
  unlist() %>%
  unique()

# NOTE(review): this call recomputes the variable features on all cells and
# therefore replaces the per-batch union assigned just above -- confirm which
# of the two selections is intended. Kept as-is so results do not change.
seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "vst", nfeatures = 2000)

# Scale every gene, then run a 20-component PCA on the variable features.
all.genes <- rownames(seurat_obj)
seurat_obj <- ScaleData(seurat_obj, features = all.genes) %>%
  RunPCA(features = VariableFeatures(object = seurat_obj), npcs = 20, verbose = FALSE)

library(harmony)
# Batch-correct the PCA embedding with Harmony, grouping cells by `batch`.
seurat_obj <- RunHarmony(
  seurat_obj,
  "batch",
  plot_convergence = TRUE,
  nclust = 50,
  max_iter = 10,
  early_stop = TRUE
)

# Neighbour graph on the Harmony embedding followed by clustering, once per
# resolution (0.5, then 0.7); the graph is rebuilt before each clustering run.
for (res in c(0.5, 0.7)) {
  seurat_obj <- FindNeighbors(seurat_obj, reduction = "harmony")
  seurat_obj <- FindClusters(seurat_obj, resolution = res)
}

# UMAP on the first 20 Harmony dimensions.
seurat_obj <- RunUMAP(seurat_obj, reduction = "harmony", dims = 1:20)

library(cowplot)
# Side-by-side UMAPs: coloured by batch (left) and by active identity (right).
p1 <- DimPlot(seurat_obj, reduction = "umap", group.by = "batch", pt.size = .1)
p2 <- DimPlot(seurat_obj, reduction = "umap", label = TRUE, pt.size = .1)
plot_grid(p1, p2)

# PCA scatter, elbow plot, and the current per-cell metadata table.
DimPlot(seurat_obj, reduction = "pca")
ElbowPlot(seurat_obj)
seurat_obj@meta.data
#raw_counts<-LayerData(seurat_obj,assay="RNA",layer="counts")
#norm_counts <- LayerData(seurat_obj, assay = "RNA", layer = 'data')

# Re-run PCA and rebuild the neighbour graph.
# NOTE(review): FindNeighbors() is called without `reduction`, so the graph
# used by the clusterings below is built on dims 1:10 of the default reduction
# rather than on "harmony", while the UMAP shown in the plots was computed on
# "harmony" -- confirm this is intended.
seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(seurat_obj), npcs = 20, verbose = FALSE)
#seurat_obj <- RunUMAP(seurat_obj,dims=1:10)
seurat_obj <- FindNeighbors(seurat_obj, dims = 1:10)

# Cluster at several resolutions; each run stores RNA_snn_res.<resolution> in
# the metadata. The last resolution (0.9) is the one left in `seurat_clusters`.
for (res in c(0.1, 0.5, 0.6, 0.7, 0.9)) {
  seurat_obj <- FindClusters(seurat_obj, resolution = res)
}
seurat_obj@meta.data

# One labelled DimPlot per resolution (dim_p5 / resolution 0.8 is disabled).
dim_p1 <- DimPlot(seurat_obj, group.by = "RNA_snn_res.0.1", label = TRUE, repel = TRUE)
dim_p2 <- DimPlot(seurat_obj, group.by = "RNA_snn_res.0.5", label = TRUE, repel = TRUE)
dim_p4 <- DimPlot(seurat_obj, group.by = "RNA_snn_res.0.6", label = TRUE, repel = TRUE)
dim_p3 <- DimPlot(seurat_obj, group.by = "RNA_snn_res.0.7", label = TRUE, repel = TRUE)
#dim_p5<-DimPlot(seurat_obj, group.by = "RNA_snn_res.0.8",label = TRUE, repel = TRUE)
dim_p6 <- DimPlot(seurat_obj, group.by = "RNA_snn_res.0.9", label = TRUE, repel = TRUE)
dim_p1
dim_p2
dim_p3
dim_p4
#dim_p5
dim_p6

# Marker-gene feature plots.
f_p <- FeaturePlot(
  seurat_obj,
  features = c(
    "PTPRC", "CD3E", "CD79A", "CD68", "CD34", "VWF", "EPCAM", "KRT19",
    "FAP", "COL1A1", "KRT18", "PMEL", "MLANA", "MKI67", "ALB", "TPSAB1",
    "TPSB2", "TP63", "KRT5", "KRT14", "CD44"
  )
)
# NOTE(review): "CD24A", "CD227", "CD340", "ALDH1" and "CD49f" do not look
# like the gene symbols used elsewhere in this script -- verify that they
# exist in rownames(seurat_obj).
f_p2 <- FeaturePlot(
  seurat_obj,
  features = c("CD24A", "CD227", "CD340", "ALDH1", "CD49f", "CD44", "MUC1", "COL17A1")
)
f_p
f_p2
dim_p2
plot_grid(dim_p1, dim_p2)
plot_grid(dim_p1, dim_p2, f_p)

# Cluster sizes at resolution 0.1.
table(seurat_obj$RNA_snn_res.0.1)
#new.cluster.ids
#conversion_map<-c(0="C0",1="C1",2="C2",3="C3",4="C4",5="C5")
#t<-conversion_map[seurat_obj@meta.data$RNA_snn_res.0.5]
#seurat_obj@meta.data$RNA_snn_res.0.5<-factor(paste0("C",as.character(seurat_obj@meta.data$RNA_snn_res.0.5)),levels=c("C0","C1","C2","C3","C4","C5"))

# Marker genes for every cluster, computed independently for four of the
# clusterings stored in the metadata:
#   xxx2 = resolution 0.1, xxx = 0.5, xxx3 = 0.6, xxx4 = 0.7
xxx2 <- FindAllMarkers(
  seurat_obj,
  group.by = "RNA_snn_res.0.1"
)
xxx <- FindAllMarkers(
  seurat_obj,
  group.by = "RNA_snn_res.0.5"
)
xxx3 <- FindAllMarkers(
  seurat_obj,
  group.by = "RNA_snn_res.0.6"
)
xxx4 <- FindAllMarkers(
  seurat_obj,
  group.by = "RNA_snn_res.0.7"
)

# Open the marker tables for inspection (the 0.5 table is not opened here).
view(xxx2)
view(xxx3)
view(xxx4)

# Print the object summary.
seurat_obj


# convert a v5 assay to a v3 assay
#SO.ASA17.EP.test[["RNA"]] <- as(object = SO.ASA17.EP.test[["RNA"]], Class = "Assay")
seurat_obj[["RNA"]] <- as(object = seurat_obj[["RNA"]], Class = "Assay")

# Work on a copy of the metadata and add "C<cluster>" label columns:
#   cell_id   from `seurat_clusters`
#   cell_id_7 from the resolution-0.7 clustering
meta <- seurat_obj@meta.data
meta$cell_id <- paste0("C", meta$seurat_clusters)
meta$cell_id_7 <- paste0("C", meta$RNA_snn_res.0.7)
table(meta$RNA_snn_res.0.9)

# inferCNV grouping: defaults to the resolution-0.7 label. Kept as character
# so the "_ref" / "_spike" labels below can be assigned.
meta$cell_id_7_for_infercnv <- as.character(meta$cell_id_7)
table(meta$cell_id_7_for_infercnv)

# Split each listed cluster into up to 16 randomly chosen "<cluster>_ref"
# cells; all remaining cells of that cluster become "<cluster>_spike".
# The seed makes the random split reproducible between runs.
set.seed(42)
for (clust in c("C6")) {
  idx <- which(meta$cell_id_7 == clust)
  n_ref <- min(16, length(idx))
  # Sample positions within idx: sample(idx, n) would draw from 1:idx when
  # idx has exactly one element.
  ref_idx <- idx[sample.int(length(idx), n_ref)]

  meta$cell_id_7_for_infercnv[ref_idx] <- paste0(clust, "_ref")
  meta$cell_id_7_for_infercnv[setdiff(idx, ref_idx)] <- paste0(clust, "_spike")
}

# Assign it back to Seurat object (still a data.frame).
# NOTE(review): the earlier version dropped metadata column 12 by position;
# with the columns created in this script that position is `cell_id`, so it is
# now dropped by name -- confirm that `cell_id` was the intended column.
seurat_obj@meta.data <- meta[, setdiff(colnames(meta), "cell_id"), drop = FALSE]

table(seurat_obj@meta.data$cell_id_7_for_infercnv)
# Make 'RNA' the active assay before exporting.
DefaultAssay(seurat_obj) <- "RNA"

# Slimmed-down copy of the object: RNA assay with counts and normalised data
# (no scaled data), all genes, the umap / pca / harmony reductions and the
# RNA_nn / RNA_snn graphs.
seu <- DietSeurat(
  seurat_obj,
  counts = TRUE,       # so, raw counts save to adata.raw.X
  data = TRUE,         # so, log1p counts save to adata.X
  scale.data = FALSE,  # set to false, or else will save to adata.X
  features = rownames(seurat_obj),  # export all genes, not just top highly variable genes
  assays = c("RNA"),
  dimreducs = c("umap", "pca", "harmony"),
  graphs = c("RNA_nn", "RNA_snn")  # to RNA_nn -> distances, RNA_snn -> connectivities
  #  misc = TRUE
)
table(seurat_obj@meta.data$cell_id_7_for_infercnv)

library(SeuratDisk)
# Save as h5Seurat, then convert that file to h5ad.
# NOTE(review): the full `seurat_obj` is written here, not the slimmed `seu`
# created above -- confirm which object is meant to be exported.
SaveH5Seurat(
  seurat_obj,
  filename = "/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/esophageal_cancer/inferCNVpy/eso.h5Seurat",
  overwrite = TRUE
)
Convert(
  "/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/esophageal_cancer/inferCNVpy/eso.h5Seurat",
  dest = "h5ad",
  overwrite = TRUE
)

# The UMAP coordinates are not exported, so they are written out separately
# and have to be attached again on the scanpy side.
K <- as.data.frame(seu[["umap"]]@cell.embeddings)
K
#seu@reductions$harmony@
write.csv(
  K,
  "/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/esophageal_cancer/inferCNVpy/eso.h5Seurat.h5Seurat.umap3.csv"
)


library(DoubletFinder)
## pK Identification (no ground-truth) ---------------------------------------------------------------------------------------
sweep.res.list_kidney <- paramSweep(seurat_obj, PCs = 1:10, sct = FALSE)
sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE)
bcmvn_kidney <- find.pK(sweep.stats_kidney)
# NOTE(review): `bcmvn_kidney` is computed but not used; pK is fixed below --
# confirm that 0.09 is the value selected from this sweep.


## Homotypic Doublet Proportion Estimate -------------------------------------------------------------------------------------
homotypic.prop <- modelHomotypic(seurat_obj@meta.data$RNA_snn_res.0.5)           ## ex: annotations <- seu_kidney@meta.data$ClusteringResults
nExp_poi <- round(0.075 * nrow(seurat_obj@meta.data))  ## Assuming 7.5% doublet formation rate - tailor for your dataset
nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))

## Run DoubletFinder with varying classification stringencies ----------------------------------------------------------------
pN_use <- 0.25
pK_use <- 0.09

# First pass: computes pANN and classifies with the unadjusted expectation.
seu_kidney <- doubletFinder(seurat_obj, PCs = 1:10, pN = pN_use, pK = pK_use, nExp = nExp_poi, reuse.pANN = NULL, sct = FALSE)

# The first pass stores pANN in a metadata column named pANN_<pN>_<pK>_<nExp>.
# Build that name from the values actually used instead of hard-coding the
# cell-count-dependent suffix.
pann_col <- paste("pANN", pN_use, pK_use, nExp_poi, sep = "_")
stopifnot(pann_col %in% colnames(seu_kidney@meta.data))

# Second pass: reuse that pANN column with the homotypic-adjusted expectation.
# It has to run on `seu_kidney`, because `seurat_obj` does not carry the pANN
# column created by the first pass.
seu_kidney <- doubletFinder(seu_kidney, PCs = 1:10, pN = pN_use, pK = pK_use, nExp = nExp_poi.adj, reuse.pANN = pann_col, sct = FALSE)

# Marker genes per cluster for the resolution-0.5 clustering.
xxx <- FindAllMarkers(
  seurat_obj,
  group.by = "RNA_snn_res.0.5"
)

seurat_obj


# convert a v5 assay to a v3 assay
#SO.ASA17.EP.test[["RNA"]] <- as(object = SO.ASA17.EP.test[["RNA"]], Class = "Assay")
seurat_obj[["RNA"]] <- as(object = seurat_obj[["RNA"]], Class = "Assay")

# Append a "C<cluster>" label column built from `seurat_clusters`.
seurat_obj@meta.data <- cbind(
  seurat_obj@meta.data,
  cell_id = paste0("C", seurat_obj@meta.data$seurat_clusters)
)
table(seurat_obj@meta.data$cell_id_7_for_infercnv)

# Make 'RNA' the active assay before exporting.
DefaultAssay(seurat_obj) <- "RNA"

# Slimmed-down copy: RNA assay with counts and normalised data (no scaled
# data), all genes, umap / pca / harmony reductions, RNA_nn / RNA_snn graphs.
seu <- DietSeurat(
  seurat_obj,
  counts = TRUE,       # so, raw counts save to adata.raw.X
  data = TRUE,         # so, log1p counts save to adata.X
  scale.data = FALSE,  # set to false, or else will save to adata.X
  features = rownames(seurat_obj),  # export all genes, not just top highly variable genes
  assays = c("RNA"),
  dimreducs = c("umap", "pca", "harmony"),
  graphs = c("RNA_nn", "RNA_snn")  # to RNA_nn -> distances, RNA_snn -> connectivities
  #  misc = TRUE
)

library(SeuratDisk)
# Save as h5Seurat, then convert that file to h5ad.
# NOTE(review): the output files are named "TNBC" although they are written
# into the esophageal_cancer directory, and `seurat_obj` is saved rather than
# `seu` or the DoubletFinder result -- confirm both are intended.
SaveH5Seurat(
  seurat_obj,
  filename = "/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/esophageal_cancer/inferCNVpy/TNBC.h5Seurat",
  overwrite = TRUE
)
Convert(
  "/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/esophageal_cancer/inferCNVpy/TNBC.h5Seurat",
  dest = "h5ad",
  overwrite = TRUE
)

# The UMAP coordinates are not exported, so they are written out separately
# and have to be attached again on the scanpy side.
K <- as.data.frame(seu[["umap"]]@cell.embeddings)
#seu@reductions$harmony@
write.csv(
  K,
  "/home/users/ayh/Projects/27_A3B/07_revision/scRNA_seq/esophageal_cancer/inferCNVpy/TNBC.h5Seurat.umap3.csv"
)

library(Seurat)
library(zellkonverter)
library(SingleCellExperiment)
library(SeuratDisk)
# Convert Seurat object to AnnData format and save as .h5ad

seurat_obj[["RNA"]] <- as(object = seurat_obj[["RNA"]], Class = "Assay")
seurat_obj@reductions

# zellkonverter writes SingleCellExperiment objects, so convert the Seurat
# object first. (`SingleCellExperiment::S()` does not exist and `sce_obj` was
# never created.) The variable keeps the name `adata`, but it holds a
# SingleCellExperiment, not a Python AnnData object.
adata <- Seurat::as.SingleCellExperiment(seurat_obj, assay = "RNA")

# Ensure the 'batch' and 'sample_id' columns are present in the cell-level
# annotation (colData), looked up by cell name rather than by position.
meta_idx <- match(colnames(adata), metadata$sample_id)
if (anyNA(meta_idx)) {
  stop("Some cells of the exported object have no row in `metadata`", call. = FALSE)
}
colData(adata)$batch <- metadata$batch[meta_idx]
colData(adata)$sample_id <- metadata$sample_id[meta_idx]



# Save to .h5ad file
zellkonverter::writeH5AD(adata, file = "rsem_infercnv.h5ad")

cat("AnnData object saved as rsem_infercnv.h5ad with batch information\n")
