library(scDD)
library(Linnorm)
library(SingleCellExperiment)
library(stringi)

# Simulation ----
# Builds the seed dataset for scDD's simulateSet() from the cell-cycle table
# and writes five simulated replicates for every target cell count.
SeedData <- read.delim("revised_dropout/cellcycle.txt", sep = "\t", header = TRUE)
rownames(SeedData) <- SeedData[, 1]
SeedData <- SeedData[, -1]

# Keep only the columns whose names contain "T0" or "T72". The T0 columns are
# placed first so the condition labels built below match the column order.
SeedData1 <- SeedData[, stri_detect_fixed(colnames(SeedData), "T0")]
SeedData2 <- SeedData[, stri_detect_fixed(colnames(SeedData), "T72")]
SeedData <- cbind(SeedData1, SeedData2)

# Keep genes that are zero in fewer than 80% of the retained cells.
# The previous expression tested `counts == 0`, but `counts` is never assigned
# in this script (with the Bioconductor packages attached it resolves to a
# generic function), so the zero fraction is computed on SeedData itself.
SeedData <- SeedData[rowSums(SeedData == 0) / ncol(SeedData) < .8, ]

SeedData <- data.matrix(SeedData)
SeedDatasce <- SingleCellExperiment(
  list(normcounts = SeedData),
  colData = data.frame(
    condition = c(rep(1, ncol(SeedData1)), rep(2, ncol(SeedData2)))
  )
)

path <- "revised_dropout/difcellnumdata/"
ncells <- c(20, 50, 100, 250, 500, 1000, 2500, 5000, 10000)
# simulateSet() is given the halved value as numSamples; the output file name
# records the doubled value again.
ncells <- ncells / 2
for (i in ncells) {
  for (j in seq_len(5)) {
    # NOTE(review): simulateSet() is documented to call set.seed(random.seed)
    # with a fixed default (284 in the scDD manual - confirm). Without a
    # per-replicate seed the five replicates of one cell count are identical,
    # so the seed is varied with j; replicate 1 keeps the default seed.
    SD <- simulateSet(SeedDatasce, numSamples = i,
                      nDE = 2000, nEE = 18000, nDB = 0, nDP = 0, nDM = 0,
                      nEP = 0, plots = FALSE, random.seed = 284 + (j - 1))
    SD <- as.data.frame(normcounts(SD))
    filename <- paste0(path, i * 2, "_cells_", j, ".csv")
    write.csv(SD, filename)
  }
}

### DEGage testing ----
# Runs DEGage on every simulated file and appends one row of performance
# metrics per file to revised_dropout/DEGage_C.csv.
library("DEGage")

path <- "revised_dropout/difcellnumdata/"
files <- list.files(path)
DEgage.finaldf <- data.frame()
# Original author's note: "14 doesnt work" (presumably files[14] - TODO confirm)
# seq_along() makes the loop a no-op for an empty directory; 1:length(files)
# would iterate over c(1, 0) and fail on a non-existent file.
for (i in seq_along(files)) {
  print(files[i])

  # The total cell count is the part of the file name before the first "_".
  underscore_index <- unlist(gregexpr("_", files[i]))[1]
  ncells <- as.numeric(substr(files[i], 1, underscore_index - 1))
  # First half of the columns is labelled group 1, second half group 2.
  group <- factor(c(rep(1, ncells / 2), rep(2, ncells / 2)))

  file <- read.csv(paste0(path, files[i]))
  rownames(file) <- file[, 1]
  file <- file[, -1]

  # Only the DEGage() call itself is timed.
  starttime <- Sys.time()
  DEgage.results <- DEGage(file, group, perm.preprocess = FALSE, nsubsample = 3000)
  DEgage.runtime <- difftime(Sys.time(), starttime, units = "secs")
  write.csv(DEgage.results, paste0("revised_dropout/diffcellno_DEGage_c/", files[i]))

  # Genes without a p-value cannot be called; drop them before thresholding.
  DEgage.results <- DEgage.results[!is.na(DEgage.results$pval), ]
  expDEs <- rownames(DEgage.results[DEgage.results$pval <= 0.05, ])

  # Rows 1:2000 are treated as the true DE genes and rows 2001:20000 as the
  # true EE genes (the simulation requests nDE = 2000 and nEE = 18000).
  trueDEs <- rownames(file[1:2000, ])
  trueEEs <- rownames(file[2001:20000, ])

  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  DEgage.sensitivity <- tp / (tp + fn)
  DEgage.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 20000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  DEgage.df <- data.frame("Trial" = files[i], DEGs = length(expDEs),
                          "Sensitivity" = DEgage.sensitivity,
                          "Specificity" = DEgage.specificity,
                          "Precision" = precision,
                          "Accuracy" = accuracy, "F1" = f1,
                          "Runtime" = DEgage.runtime)

  # The summary is rewritten after every file so partial results survive a
  # failure later in the loop.
  DEgage.finaldf <- rbind(DEgage.finaldf, DEgage.df)
  write.csv(DEgage.finaldf, "revised_dropout/DEGage_C.csv")
}

### DESeq2 testing ----
# Runs DESeq2 on every simulated file and appends one row of performance
# metrics per file to revised_dropout/DESeq2_Zero_Results_diffcellno.csv.
library("DESeq2")
path <- "revised_dropout/difcellnumdata/"
files <- list.files(path)
DESeq2.finaldf <- data.frame()
for (i in seq_along(files)) {
  print(files[i])

  # The total cell count is the part of the file name before the first "_".
  underscore_index <- unlist(gregexpr("_", files[i]))[1]
  ncells <- as.numeric(substr(files[i], 1, underscore_index - 1))

  file <- read.csv(paste0(path, files[i]))
  rownames(file) <- file[, 1]
  file <- file[, -1]

  # Convert every column to integer counts. This used to be an inner
  # `for (i in ...)` loop, which overwrote the outer loop index so that the
  # "Trial" recorded below was files[ncol(file)] instead of the current file.
  file[] <- lapply(file, as.integer)
  # Adds 1 to every count of row 1501. The reason is not documented in the
  # script - presumably a workaround for a DESeq2 failure; TODO confirm.
  file[1501, ] <- file[1501, ] + 1

  filler <- matrix(c(rep("Group.1", ncells / 2), rep("Group.2", ncells / 2)),
                   nrow = ncells, dimnames = list(colnames(file), "Group"))
  DESeqobj <- DESeqDataSetFromMatrix(countData = file, colData = filler, design = ~Group)

  # Only the DESeq() call itself is timed.
  starttime <- Sys.time()
  sim.Deseq <- DESeq(DESeqobj)
  sim.endtime <- Sys.time()
  sim.runtime <- difftime(sim.endtime, starttime, units = "secs")
  simresults <- DESeq2::results(sim.Deseq)

  # Genes with an NA adjusted p-value are never called DE.
  expDEs <- simresults[!is.na(simresults$padj), ]
  expDEs <- rownames(expDEs[expDEs$padj <= 0.05, ])

  # Rows 1:2000 are treated as true DE genes, rows 2001:20000 as true EE genes.
  trueDEs <- rownames(file[1:2000, ])
  trueEEs <- rownames(file[2001:20000, ])

  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 20000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  simdf <- data.frame("Trial" = files[i], "DEGs" = length(expDEs),
                      "Sensitivity" = sim.sensitivity,
                      "Specificity" = sim.specificity,
                      "Precision" = precision,
                      "Accuracy" = accuracy, "F1" = f1,
                      "Runtime" = sim.runtime)

  # The summary is rewritten after every file so partial results are kept.
  DESeq2.finaldf <- rbind(DESeq2.finaldf, simdf)
  write.csv(DESeq2.finaldf, "revised_dropout/DESeq2_Zero_Results_diffcellno.csv")
}

### monocle3 testing ----
# Fits a per-gene "~Group" model with monocle3 on every simulated file and
# appends one row of performance metrics per file to
# revised_dropout/monocle_diffcellno.csv.
library(monocle3)
library(dplyr)
path <- "revised_dropout/difcellnumdata/"
files <- list.files(path)
monocle.finaldf <- data.frame()
# seq_along() makes the loop a no-op for an empty directory; 1:length(files)
# would iterate over c(1, 0) and fail on a non-existent file.
for (i in seq_along(files)) {
  print(files[i])

  # The total cell count is the part of the file name before the first "_".
  underscore_index <- unlist(gregexpr("_", files[i]))[1]
  ncells <- as.numeric(substr(files[i], 1, underscore_index - 1))

  file <- read.csv(paste0(path, files[i]))
  rownames(file) <- file[, 1]
  file <- file[, -1]

  # First half of the columns is labelled group 1, second half group 2.
  cell_metadata <- data.frame(Group = c(rep(1, ncells / 2), rep(2, ncells / 2)),
                              row.names = colnames(file))
  gene_metadata <- data.frame(gene_short_name = rownames(file),
                              row.names = rownames(file))
  sim.cds <- new_cell_data_set(data.matrix(file),
                               cell_metadata = cell_metadata,
                               gene_metadata = gene_metadata)

  # Model fitting and coefficient extraction are timed together.
  starttime <- Sys.time()
  gene_fits <- fit_models(sim.cds, model_formula_str = "~Group")
  fit_coefs <- coefficient_table(gene_fits)
  sim.runtime <- difftime(Sys.time(), starttime, units = "secs")

  # dplyr verbs are namespace-qualified so they cannot be masked by a
  # same-named function from another attached package.
  intermediate <- fit_coefs %>% dplyr::filter(term == "Group")
  sim.degs <- intermediate %>%
    dplyr::filter(p_value < 0.05) %>%
    dplyr::select(gene_short_name, p_value)
  sim.degs <- as.data.frame(sim.degs)
  rownames(sim.degs) <- sim.degs[, 1]

  expDEs <- as.vector(sim.degs[, 1])

  # Rows 1:2000 are treated as true DE genes, rows 2001:20000 as true EE genes.
  trueDEs <- rownames(file[1:2000, ])
  trueEEs <- rownames(file[2001:20000, ])

  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 20000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  simdf <- data.frame("Trial" = files[i], DEGs = length(expDEs),
                      "Sensitivity" = sim.sensitivity,
                      "Specificity" = sim.specificity,
                      "Precision" = precision,
                      "Accuracy" = accuracy, "F1" = f1,
                      "Runtime" = sim.runtime)
  # The summary is rewritten after every file so partial results are kept.
  monocle.finaldf <- rbind(monocle.finaldf, simdf)
  write.csv(monocle.finaldf, "revised_dropout/monocle_diffcellno.csv")
}


### scDD testing ----
# Runs scDD on every simulated file and appends one row of performance
# metrics per file to revised_dropout/scDD_diffcellno.csv.
library(scDD)
path <- "revised_dropout/difcellnumdata/"
files <- list.files(path)
scDD.finaldf <- data.frame()
for (i in seq_along(files)) {
  print(files[i])

  # The total cell count is the part of the file name before the first "_".
  underscore_index <- unlist(gregexpr("_", files[i]))[1]
  ncells <- as.numeric(substr(files[i], 1, underscore_index - 1))

  file <- read.csv(paste0(path, files[i]))
  rownames(file) <- file[, 1]
  file <- file[, -1]

  simsce <- SingleCellExperiment(assays = list(counts = file))
  colData(simsce)$condition <- c(rep("Group.1", ncells / 2), rep("Group.2", ncells / 2))
  simsce <- preprocess(simsce, scran_norm = TRUE)

  # Only the scDD() call itself is timed.
  starttime <- Sys.time()
  sim.scDD.output <- scDD(simsce)
  sim.runtime <- difftime(Sys.time(), starttime, units = "secs")

  # DESeq2, attached earlier in this script, also exports a results()
  # function; the call is namespace-qualified so scDD's accessor is used
  # regardless of attach order.
  sim.results <- scDD::results(sim.scDD.output)
  # which() drops NA adjusted p-values. Plain logical indexing would keep an
  # all-NA row for each of them and inflate the reported DEG count.
  sim.degs <- sim.results[which(sim.results$nonzero.pvalue.adj <= 0.05), ]

  expDEs <- rownames(sim.degs)
  # Rows 1:2000 are treated as true DE genes, rows 2001:20000 as true EE genes.
  trueDEs <- rownames(file[1:2000, ])
  trueEEs <- rownames(file[2001:20000, ])

  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 20000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  simdf <- data.frame("Trial" = files[i], DEGs = length(expDEs),
                      "Sensitivity" = sim.sensitivity,
                      "Specificity" = sim.specificity,
                      "Precision" = precision,
                      "Accuracy" = accuracy, "F1" = f1,
                      "Runtime" = sim.runtime)

  # The summary is rewritten after every file so partial results are kept.
  scDD.finaldf <- rbind(scDD.finaldf, simdf)
  write.csv(scDD.finaldf, "revised_dropout/scDD_diffcellno.csv")
}

### DEsingle testing ----
# Runs DEsingle on every simulated file and appends one row of performance
# metrics per file to revised_dropout/Desingle_diffcellno.csv.
library(DEsingle)
path <- "revised_dropout/difcellnumdata/"
files <- list.files(path)
DEsingle.finaldf <- data.frame()
for (i in seq_along(files)) {
  print(files[i])

  # The total cell count is the part of the file name before the first "_".
  underscore_index <- unlist(gregexpr("_", files[i]))[1]
  ncells <- as.numeric(substr(files[i], 1, underscore_index - 1))

  file <- read.csv(paste0(path, files[i]))
  rownames(file) <- file[, 1]
  file <- file[, -1]

  # First half of the columns is labelled group 1, second half group 2.
  simgroups <- factor(c(rep(1, ncells / 2), rep(2, ncells / 2)))

  # Only the DEsingle() call itself is timed. The runtime was previously
  # computed from `starttime`, a variable this section never sets, so it
  # measured from whatever an earlier section had left behind.
  sim.starttime <- Sys.time()
  simresults <- DEsingle(file, simgroups)
  sim.endtime <- Sys.time()
  sim.runtime <- difftime(sim.endtime, sim.starttime, units = "secs")

  simresults <- DEtype(simresults, threshold = 0.05)

  # Genes with an NA adjusted p-value are never called DE.
  expDEs <- simresults[!is.na(simresults$pvalue.adj.FDR), ]
  expDEs <- rownames(expDEs[expDEs$pvalue.adj.FDR <= 0.05, ])

  # Rows 1:2000 are treated as true DE genes, rows 2001:20000 as true EE genes.
  trueDEs <- rownames(file[1:2000, ])
  trueEEs <- rownames(file[2001:20000, ])

  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 20000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  simdf <- data.frame("Trial" = files[i], DEGs = length(expDEs),
                      "Sensitivity" = sim.sensitivity,
                      "Specificity" = sim.specificity,
                      "Precision" = precision,
                      "Accuracy" = accuracy, "F1" = f1,
                      "Runtime" = sim.runtime)

  # The summary is rewritten after every file so partial results are kept.
  DEsingle.finaldf <- rbind(DEsingle.finaldf, simdf)
  write.csv(DEsingle.finaldf, "revised_dropout/Desingle_diffcellno.csv")
}

### edgeR testing ----
# Runs the edgeR quasi-likelihood pipeline on every simulated file and appends
# one row of performance metrics per file to
# revised_dropout/edger_diffcellnoresults.csv.
library(edgeR)
path <- "revised_dropout/difcellnumdata/"
files <- list.files(path)

# Unlike the other sections, this one continues from a previously written
# results file (its first column is the row-name column added by write.csv).
# On a first run that file does not exist yet, so start from an empty frame
# instead of failing in read.csv().
edger.results.path <- "revised_dropout/edger_diffcellnoresults.csv"
if (file.exists(edger.results.path)) {
  edger.finaldf <- read.csv(edger.results.path)
  edger.finaldf <- edger.finaldf[, -1]
} else {
  edger.finaldf <- data.frame()
}

for (i in seq_along(files)) {
  print(files[i])

  # The total cell count is the part of the file name before the first "_".
  underscore_index <- unlist(gregexpr("_", files[i]))[1]
  ncells <- as.numeric(substr(files[i], 1, underscore_index - 1))

  file <- read.csv(paste0(path, files[i]))
  rownames(file) <- file[, 1]
  file <- file[, -1]

  group <- matrix(c(rep("Group1", ncells / 2), rep("Group2", ncells / 2)),
                  nrow = ncells, dimnames = list(colnames(file), "Group"))
  group <- factor(group)

  # Named `dge` rather than `list` so base::list is not shadowed.
  dge <- DGEList(file)

  design <- model.matrix(~0 + group)
  colnames(design) <- levels(group)
  starttime <- Sys.time()

  # NOTE(review): AveLogCPM is not used afterwards, but it sits inside the
  # timed region; it is kept so the measured work is unchanged.
  AveLogCPM <- aveLogCPM(dge)
  dge <- calcNormFactors(dge)
  # The argument is `robust` (lower case). The previous `Robust = TRUE` did not
  # match it, so robust dispersion estimation was never switched on here.
  dge <- estimateDisp(dge, design, robust = TRUE)
  fit <- glmQLFit(dge, design, robust = TRUE)
  onev.two <- makeContrasts(Group1 - Group2, levels = design)
  res <- glmQLFTest(fit, contrast = onev.two)
  sim.runtime <- difftime(Sys.time(), starttime, units = "secs")

  # topTags() applies the FDR adjustment and the 0.05 cut-off itself.
  topgenes <- as.data.frame(topTags(res, n = 20000, adjust.method = "fdr", p.value = 0.05))
  expDEs <- rownames(topgenes)

  # Rows 1:2000 are treated as true DE genes, rows 2001:20000 as true EE genes.
  trueDEs <- rownames(file[1:2000, ])
  trueEEs <- rownames(file[2001:20000, ])

  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 20000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  simdf <- data.frame("Trial" = files[i], "DEGs" = length(expDEs),
                      "Sensitivity" = sim.sensitivity,
                      "Specificity" = sim.specificity,
                      "Precision" = precision,
                      "Accuracy" = accuracy, "F1" = f1,
                      "Runtime" = sim.runtime)

  # The summary is rewritten after every file so partial results are kept.
  edger.finaldf <- rbind(edger.finaldf, simdf)
  write.csv(edger.finaldf, edger.results.path)
}

## Analysis ----
# Summarises one metric column (mean and sd over the five trials per cell
# count) for each package's results table, plots the summary, and writes the
# runtime table.
library(ggplot2)
path <- "diffcellnoresults/"
files <- list.files(path)
dfs <- lapply(paste0(path, files), read.csv)

increments <- ncells <- c(20, 50, 100, 250, 500, 1000, 2500)
# Cell count for each of the 35 rows of a results table, five rows per count,
# in the order 100, 1000, 20, 250, 2500, 50, 500.
trials <- rep(c(100, 1000, 20, 250, 2500, 50, 500), each = 5)
packages <- c("DEGage", "DESeq2", "DESingle", "edger", "monocle3", "scDD")

# This was run twice: once with Sensitivity as the target column and once with
# Runtime.
summary_rows <- vector("list", 6 * length(increments))
slot <- 0
for (pkg in seq_len(6)) {
  pkg_df <- dfs[[pkg]]
  pkg_df$Trial <- trials
  for (n_cells in increments) {
    # Swap `Runtime` for any other column to summarise that metric instead.
    obs <- pkg_df$Runtime[pkg_df$Trial == n_cells]
    obs <- obs[!is.na(obs)]
    slot <- slot + 1
    summary_rows[[slot]] <- data.frame(Package = packages[pkg],
                                       Ncells = n_cells,
                                       Mean = mean(obs),
                                       sd = sd(obs))
  }
}
plotdf <- do.call(rbind, summary_rows)

# Rows 8:14 are the seven rows of the second package; they are divided by 60
# here, in addition to the division by 60 applied in the runtime plot.
second_pkg_rows <- 8:14
plotdf$Mean[second_pkg_rows] <- plotdf$Mean[second_pkg_rows] / 60
plotdf$sd[second_pkg_rows] <- plotdf$sd[second_pkg_rows] / 60


library(ggplot2)
library(ggsci)
runtime_aes <- aes(x = Ncells, y = Mean / 60, col = Package,
                   ymin = (Mean - sd) / 60, ymax = (Mean + sd) / 60)
Runtime.plt <- ggplot(plotdf, runtime_aes) +
  geom_line(linewidth = 0.75) +
  geom_errorbar(linewidth = 0.75, width = 35) +
  labs(x = "Number Of Cells", y = "Mean Runtime (Min)") +
  ggtitle("Runtimes") +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_npg() +
  coord_cartesian(xlim = c(0, 20), ylim = c(0, 250))
Runtime.plt

sensitivity_aes <- aes(x = Ncells, y = Mean, col = Package,
                       ymin = (Mean - sd), ymax = (Mean + sd))
sensitivity.plt <- ggplot(plotdf, sensitivity_aes) +
  geom_line(linewidth = 0.75) +
  geom_errorbar(linewidth = 0.75, width = 7) +
  labs(x = "Number Of Cells", y = "Average Sensitivity") +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_cartesian(xlim = c(0, 2500))
sensitivity.plt

# The written table holds Mean and sd divided by 60.
plotdf$Mean <- plotdf$Mean / 60
plotdf$sd <- plotdf$sd / 60
write.csv(plotdf, "Supplementary_runtime_table.csv")













