library(Seurat)
library(ggplot2)

# NOTE(review): hard-coded, user-specific working directory. Every relative
# path used by this script ("data/", "DEGage_C/", "1vall/") resolves against it.
setwd("~/Documents/DEGage_stuff/DEGage_Testing/Heidegger")

#Performing preliminary processing, isolating variable features
# Each entry of data/ is read as one 10X sample directory.
samples <- list.files("data/")
if (length(samples) < 2) {
  stop("Expected at least two sample directories under data/, found ",
       length(samples), call. = FALSE)
}

# Read every sample into its own Seurat object (list preallocated up front).
sample.list <- vector("list", length(samples))
for (i in seq_along(samples)){
  print(i)
  sample.dir <- paste0("data/", samples[i], "/")
  sample.list[[i]] <- CreateSeuratObject(Read10X(data.dir = sample.dir))
}

# Merge all samples, however many were found; the sample directory names are
# prefixed to the cell names (add.cell.ids), which later code reads back with
# substr() to tell the sample groups apart.
combined <- merge(sample.list[[1]], y = sample.list[-1], add.cell.ids = samples)
combined <- JoinLayers(combined)
rm(sample.list)

# Quality control: record the percentage of counts coming from features that
# match "^MT-", then keep only cells with 200 < nFeature_RNA < 8000 and
# percent.mt < 20.
mito.percent <- PercentageFeatureSet(combined, pattern = "^MT-")
combined[["percent.mt"]] <- mito.percent
combined <- subset(
  combined,
  subset = percent.mt < 20 & nFeature_RNA < 8000 & nFeature_RNA > 200
)

# Normalise, scale every gene (not only the variable ones), then flag the
# variable features.
all.genes <- rownames(combined)
combined <- NormalizeData(combined)
combined <- ScaleData(combined, features = all.genes)
combined <- FindVariableFeatures(object = combined)


#Identifying cell types using SingleR
library(SingleR)
# Convert the Seurat object to a SingleCellExperiment; this is the `test`
# input handed to SingleR() below.
celltypesall <- as.SingleCellExperiment(combined)
# Reference dataset used for the annotation (Human Primary Cell Atlas, via celldex).
hpca.se <- celldex::HumanPrimaryCellAtlasData()
# Annotate every cell against the reference using its `label.main` labels.
# The per-cell assignments are read later from pred.all$labels.
pred.all<- SingleR(test = celltypesall, ref = hpca.se,
                     labels = hpca.se$label.main)

#Filtering out junk cells
# Labels assigned to fewer than 20 cells, plus "iPS_cells", are discarded.
pred.table <- table(pred.all$labels)
celllist <- levels(factor(pred.all$labels))
too.few <- c(names(pred.table)[pred.table < 20], "iPS_cells")
badcells <- which(pred.all$labels %in% too.few)

# Select the cells to KEEP with positive indices. Negative indexing with an
# empty `badcells` (x[, -integer(0)]) would select zero cells instead of all.
keep.cells <- setdiff(seq_len(ncol(combined)), badcells)
combined <- combined[, keep.cells]
pred.all <- pred.all[keep.cells, ]
combined$celltype <- pred.all$labels

# NOTE(review): only the first 10000 cells are kept from here on; every later
# step (tSNE, differential expression) sees just this subset. Confirm that the
# truncation is intentional and not a leftover memory workaround.
# The cap is clamped so the script does not fail when fewer cells remain.
n.pca.cells <- min(10000, ncol(combined))
combined <- RunPCA(combined[, seq_len(n.pca.cells)], fastpath = FALSE)
combined <- RunTSNE(combined)

#Cell type annotation

# Palette (17 colours) and display names (17 entries) handed to
# scale_color_manual() for the tSNE plot below.
cols <- c(
  "#800000", "#9A6324", "#808000", "#469990", "#000075", "#e6194B",
  "#ffe119", "#bfef45", "#3cb44b", "#42d4f4", "#4363d8", "#911eb4",
  "#f032e6", "#fabed4", "#ffd8b1", "#fffac8", "#aaffc3"
)
celltype_labels <- c(
  "B Cells",
  "Chondrocytes",
  "CMP",
  "Dendritic Cells",
  "Endothelial Cells",
  "Epithelial Cells",
  "Fibroblasts",
  "Hepatocytes",
  "HSC CD34+ Cells",
  "Macrophages",
  "Monocytes",
  "Neutrophils",
  "NK Cells",
  "Pre B Cells CD34-",
  "Smooth Muscle Cells",
  "T Cells",
  "Tissue Stem Cells"
)

#tsne plot
# Cells are coloured by their `celltype` metadata column.
tsne.by.celltype <- DimPlot(combined, group.by = 'celltype', reduction = "tsne")
tsne.by.celltype + scale_color_manual(values = cols, labels = celltype_labels)

#differential expression analysis with DEGage
celltypes <- levels(factor(combined$celltype))
Idents(combined) <- combined$celltype

# The third character of each cell name is used as the group tag: "n" cells
# form group 1, all other cells stay in group 0.
# NOTE(review): presumably "n" = normal and "t" = tumour, inherited from the
# sample directory names prefixed at merge time - confirm.
sample.tag <- substr(colnames(combined), 3, 3)
ni <- which(sample.tag == "n")
ti <- which(sample.tag == "t")
if (length(ni) + length(ti) != ncol(combined)) {
  warning(ncol(combined) - length(ni) - length(ti),
          " cells are tagged neither 'n' nor 't'; they are placed in group 0",
          call. = FALSE)
}
group.pos.neg <- rep(0, ncol(combined))
group.pos.neg[ni] <- 1

#Looking at all genes to get background DE expression
library(DEGage)
# Use the accessor instead of `@assays$RNA@counts`: objects that went through
# JoinLayers() hold their matrices in layers and have no `counts` slot.
pooled.counts <- GetAssayData(combined, assay = "RNA", layer = "counts")
t.vs.n <- DEGage(as.data.frame(as.matrix(pooled.counts)), factor(group.pos.neg),
                 nsubsample = 1000, perm.preprocess = FALSE)
t.vs.n.noNA <- t.vs.n[!is.na(t.vs.n$FDR),]
# NOTE(review): this filter uses the raw p-value, whereas the supplementary
# tables built later filter on FDR < 0.05 - confirm which one is intended.
t.vs.n.degs <- t.vs.n.noNA[t.vs.n.noNA$pval < 0.05,]
t.vs.n.degs[order(t.vs.n.degs$pval),]

# Make sure the output directory exists before writing into it.
dir.create("DEGage_C", showWarnings = FALSE, recursive = TRUE)
write.csv(t.vs.n, "DEGage_C/Pooled_pvn.csv")

neg_seurat <- combined[,group.pos.neg == 1]
pos_seurat <- combined[,group.pos.neg == 0]

# Raw counts (genes x cells, as a data.frame) of the cells in `object` whose
# identity is one of `types`. Returns a zero-column data.frame when no cell
# matches, instead of failing the way subset(idents = ...) does.
# The accessor replaces `@assays$RNA@counts`: objects that went through
# JoinLayers() hold their matrices in layers and have no `counts` slot.
celltype_counts <- function(object, types) {
  counts <- GetAssayData(object, assay = "RNA", layer = "counts")
  keep <- as.character(Idents(object)) %in% types
  as.data.frame(as.matrix(counts[, keep, drop = FALSE]))
}

# Runs DEGage on two count tables (group 0 = counts.a, group 1 = counts.b) and
# writes the result to `path`. A comparison with an empty group is skipped
# with a warning so one missing cell type does not abort the whole run.
degage_two_groups <- function(counts.a, counts.b, path, label) {
  if (ncol(counts.a) == 0 || ncol(counts.b) == 0) {
    warning("Skipping ", label, ": at least one group has no cells",
            call. = FALSE)
    return(invisible(NULL))
  }
  counts <- cbind(counts.a, counts.b)
  groups <- factor(c(rep(0, ncol(counts.a)), rep(1, ncol(counts.b))))
  res <- DEGage(counts, groups)
  write.csv(res, path)
  invisible(res)
}

# Compares every cell type against every other cell type within `object` and
# writes one "<type i>v<type j>.csv" per pair into `out.dir`.
#i = celltype of interest
#j = comparison cluster
pairwise_celltype_de <- function(object, types, out.dir) {
  dir.create(out.dir, showWarnings = FALSE, recursive = TRUE)
  # seq_len() keeps the loop empty when fewer than two cell types exist
  for (i in seq_len(max(length(types) - 1, 0))) {
    # counts of the cell type of interest are reused for every partner j
    counts.i <- celltype_counts(object, types[i])
    for (j in (i + 1):length(types)) {
      print(i)
      print(j)
      print(types[i])
      print(types[j])
      print("------")

      run <- paste0(types[i], "v", types[j])
      degage_two_groups(counts.i, celltype_counts(object, types[j]),
                        path = file.path(out.dir, paste0(run, ".csv")),
                        label = run)
    }
  }
  invisible(NULL)
}

#Getting DE's for every cluster against every cluster

#for positive cells only:
# Output goes INSIDE pos_celltype_comparisons/ (the directory the
# supplementary-table step reads from); a missing "/" previously produced
# files named "DEGage_C/pos_celltype_comparisons<run>.csv" instead.
pairwise_celltype_de(pos_seurat, celltypes, "DEGage_C/pos_celltype_comparisons")

#For negative cells only:
pairwise_celltype_de(neg_seurat, celltypes, "DEGage_C/neg_celltype_comparisons")

#Positive against negative cells for every cluster individually
pvn.dir <- "DEGage_C/pos_v_neg_by_celltype"
dir.create(pvn.dir, showWarnings = FALSE, recursive = TRUE)
for (i in seq_along(celltypes)) {
  print(celltypes[i])

  degage_two_groups(celltype_counts(pos_seurat, celltypes[i]),
                    celltype_counts(neg_seurat, celltypes[i]),
                    path = file.path(pvn.dir, paste0(celltypes[i], ".csv")),
                    label = paste0(celltypes[i], " (pos v neg)"))
}

##Generating Supplementary data table

# Reads one DEGage result CSV and returns its significant genes (FDR < 0.05)
# as a single ";"-separated string ("" when there are none).
#   path          - CSV written by write.csv(); gene names are in column `X`
#   use.find.infs - pass the table through find_infs(df, 10) first
#   drop.infinite - additionally drop rows whose pval is infinite
# NOTE(review): find_infs() is not defined in this file; it must already exist
# in the session (e.g. sourced from elsewhere) for the tables that request it.
sig_genes_string <- function(path, use.find.infs, drop.infinite) {
  df <- read.csv(path, header = TRUE)
  if (use.find.infs) {
    df <- find_infs(df, 10)
  }
  df <- df[!is.na(df$pval), ]
  if (drop.infinite) {
    df <- df[!is.infinite(df$pval), ]
  }
  # which() drops rows whose FDR is NA; a bare logical index would turn them
  # into all-NA rows and put "NA" into the gene list
  df <- df[which(df$FDR < 0.05), ]
  paste(gsub(" ", "", df$X), collapse = ";")
}

# Builds the celltype x celltype table whose upper triangle holds the DEG
# string of each pairwise comparison read from `in.dir`. Cells that are never
# filled keep the initial 0; a missing result file yields NA plus a warning.
pairwise_deg_table <- function(types, in.dir, use.find.infs, drop.infinite) {
  tab <- data.frame(matrix(0, ncol = length(types), nrow = length(types)))
  colnames(tab) <- types
  rownames(tab) <- types
  for (i in seq_len(max(length(types) - 1, 0))) {
    for (j in (i + 1):length(types)) {
      print(types[i])
      print(types[j])

      filename <- file.path(in.dir, paste0(types[i], "v", types[j], ".csv"))
      if (!file.exists(filename)) {
        warning("Missing comparison file: ", filename, call. = FALSE)
        tab[i, j] <- NA
        next
      }
      tab[i, j] <- sig_genes_string(filename, use.find.infs, drop.infinite)
    }
  }
  tab
}

supp.dir <- "DEGage_C/supplementary tables"
dir.create(supp.dir, showWarnings = FALSE, recursive = TRUE)

#tumor positive cells
suppdf <- pairwise_deg_table(celltypes, "DEGage_C/pos_celltype_comparisons",
                             use.find.infs = TRUE, drop.infinite = FALSE)
write.csv(suppdf, file.path(supp.dir, "pos_celltype_comparisons_degs.csv"))

# From here on the cell types are a fixed, hard-coded list rather than the
# levels found in the data.
celltypes <- c("B_cell", "Chondrocytes", "CMP", "DC", "Endothelial_cells", "Epithelial_cells",
              "Fibroblasts", "Hepatocytes", "HSC_CD34+", "Macrophage", "Monocyte",
              "Neutrophils", "NK_cell", "Pre-B_cell_CD34-", "Smooth_muscle_cells",
              "T_cells", "Tissue_stem_cells")

#negative cells
suppdf <- pairwise_deg_table(celltypes, "DEGage_C/neg_celltype_comparisons",
                             use.find.infs = FALSE, drop.infinite = TRUE)
write.csv(suppdf, file.path(supp.dir, "neg_celltype_comparisons_degs.csv"))
write.csv(suppdf$Tissue_stem_cells, file.path(supp.dir, "neg_tissue_stem_cells.csv"))

#positive v negative, one row per cell type
pvn.dir <- "DEGage_C/pos_v_neg_by_celltype"
files <- list.files(pvn.dir, pattern = "\\.csv$")
#gets rid of csv tag
titles <- substr(files, 1, nchar(files) - 4)

degs <- vapply(files, function(f) {
  print(f)
  sig_genes_string(file.path(pvn.dir, f), use.find.infs = TRUE, drop.infinite = TRUE)
}, character(1), USE.NAMES = FALSE)

# Built in one go: the table no longer starts with a placeholder row (1, 1)
# that previously ended up in the written CSV.
suppdf2 <- data.frame("Celltype" = titles, "DEGs" = degs)
write.csv(suppdf2, file.path(supp.dir, "pos_v_neg_by_celltype_degs.csv"))

##Performing 1 v all marker gene assessment

library("DEGage")

neg_seurat <- combined[,group.pos.neg == 1]
pos_seurat <- combined[,group.pos.neg == 0]

indicies <- seq_along(celltypes)

# Every cell type is processed in every run. The loops previously started at
# hard-coded offsets (2 and 17), which looks like a leftover from resuming
# interrupted runs and silently skipped cell types on a fresh run.
# Set to FALSE to resume instead: existing result files are then kept.
overwrite.existing <- TRUE

# For each cell type, runs DEGage on that type (group 0) against all other
# listed cell types pooled (group 1) within `object`, and writes
# "1vall/<celltype><suffix>.csv".
#   object     - Seurat object whose Idents() hold the cell type labels
#   suffix     - file name suffix identifying the subset ("_pos", "_neg", "_all")
#   nsubsample - forwarded to DEGage()
one_vs_all <- function(object, suffix, nsubsample) {
  dir.create("1vall", showWarnings = FALSE, recursive = TRUE)
  # Accessor instead of `@assays$RNA@counts`: objects that went through
  # JoinLayers() hold their matrices in layers and have no `counts` slot.
  counts <- GetAssayData(object, assay = "RNA", layer = "counts")
  cell.idents <- as.character(Idents(object))

  for (i in indicies) {
    path <- paste0("1vall/", celltypes[i], suffix, ".csv")
    if (!overwrite.existing && file.exists(path)) {
      message("Keeping existing result: ", path)
      next
    }

    in.type <- cell.idents %in% celltypes[i]
    in.rest <- cell.idents %in% celltypes[-i]
    if (!any(in.type) || !any(in.rest)) {
      warning("Skipping ", celltypes[i], suffix, ": at least one group has no cells",
              call. = FALSE)
      next
    }

    print(celltypes[i])
    counts.df <- cbind(as.data.frame(as.matrix(counts[, in.type, drop = FALSE])),
                       as.data.frame(as.matrix(counts[, in.rest, drop = FALSE])))
    groups <- factor(c(rep(0, sum(in.type)), rep(1, sum(in.rest))))

    res <- DEGage(counts.df, groups, nsubsample = nsubsample)
    write.csv(res, path)
  }
  invisible(NULL)
}

#positive
one_vs_all(pos_seurat, "_pos", nsubsample = 500)

#negative
one_vs_all(neg_seurat, "_neg", nsubsample = 300)

#both
# Drop the annotation objects that are no longer needed before the largest run.
rm(pred.all)
rm(celltypesall)
rm(hpca.se)
one_vs_all(combined, "_all", nsubsample = 300)

