library(stringi)
library(DEGageAP)
setwd("C:/Users/alici/Documents/Binf Research/DOTNBv2/revised_dropout")

## This file benchmarks each package on DOTNB dropout datasets that were
## simulated with varying proportions of zeros.
### Dataset generation
library(DEGage)
groups <- factor(rep(c(1, 2), each = 75))
proplist <- c(0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.50,0.55,0.6)

# This loop was run five times by hand, with the labels in plist.forstring
# edited before each run (the trailing digit of every label presumably
# identifies the run -- confirm before regenerating data).
plist.forstring <- c("005","055","105","155","205","255","305","355","405","455","505","555","605")
for (i in seq_along(proplist)) {
  print(i)
  # One simulated count table per zero proportion, written under its label.
  df <- DEGage_Simulation(
    ngenes = 13500,
    ndegs = 1500,
    cellgroups = groups,
    prop.zeros = proplist[i]
  )
  path <- paste0("DropoutDatasets/", plist.forstring[i], ".csv")
  write.csv(x = df, file = path)
}

### Trials
# DEGage
# For each simulated dataset: run DEGage (timed), drop genes whose p-value is
# NA, save the remaining per-gene results, then score the calls using the row
# layout assumed below (rows 1-1500 are taken as the true DE genes and rows
# 1501-15000 as the equivalently expressed genes).
# NOTE(review): these paths are prefixed with "revised_dropout/" while the
# generation step writes to "DropoutDatasets/" relative to the setwd() target;
# confirm the working directory before running this section.
path <- "revised_dropout/DropoutDatasets/"
files <- list.files(path)
DEgage.finaldf <- data.frame()
# seq_along() keeps the loop from running (and failing) on an empty directory.
for (i in seq_along(files)) {
  print(files[i])
  file <- read.csv(paste0(path, files[i]))
  # The first CSV column holds the gene names written by write.csv().
  rownames(file) <- file[, 1]
  file <- file[, -1]
  groups <- factor(c(rep(1, 75), rep(2, 75)))

  DEgage.starttime <- Sys.time()
  DEgage.results <- DEGage(file, groups)
  DEgage.endtime <- Sys.time()
  # Subtracting two Sys.time() values picks its unit automatically
  # (secs/mins/hours), so the former `runtime * 60` only produced seconds when
  # the run happened to be reported in minutes. Ask for seconds explicitly.
  DEgage.runtime <- as.double(
    difftime(DEgage.endtime, DEgage.starttime, units = "secs")
  )

  # Keep only genes with a usable p-value and save them before thresholding.
  DEgage.results <- DEgage.results[!is.na(DEgage.results$pval), ]
  write.csv(DEgage.results, paste0("revised_dropout/Dropout_props_DEGage_c/", files[i]))
  DEgage.results <- DEgage.results[DEgage.results$permPvals < 0.05, ]
  expDEs <- rownames(DEgage.results[DEgage.results$pval <= 0.05, ])

  trueDEs <- rownames(file[1:1500, ])
  trueEEs <- rownames(file[1501:15000, ])

  # Confusion-matrix counts of the called genes against the assumed truth.
  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  DEgage.sensitivity <- tp / (tp + fn)
  DEgage.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 15000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  # "Runtime" is in seconds. "Returned" is counted after the permPvals filter.
  DEgage.df <- data.frame("Trial" = files[i], DEGs = length(expDEs), "Sensitivity" = DEgage.sensitivity, "Specificity" = DEgage.specificity, "Precision" = precision,
                    "Accuracy" = accuracy, "F1" = f1, "Runtime" = DEgage.runtime, "Returned" = nrow(DEgage.results))

  DEgage.finaldf <- rbind(DEgage.finaldf, DEgage.df)
  # Rewritten every iteration so partial results survive an interrupted run.
  write.csv(DEgage.finaldf, "revised_dropout/zero_prop_Results/DEGage_c.csv")
}

# edgeR
# For each simulated dataset: fit edgeR's quasi-likelihood pipeline (timed),
# test Group1 vs Group2, keep genes with FDR-adjusted p-value <= 0.05 and score
# them using the row layout assumed below (rows 1-1500 taken as true DE genes,
# rows 1501-15000 as equivalently expressed genes).
library(edgeR)
path <- "revised_dropout/DropoutDatasets/"
files <- list.files(path)
edger.finaldf <- data.frame()
# seq_along() keeps the loop from running (and failing) on an empty directory.
for (i in seq_along(files)) {
  print(files[i])
  file <- read.csv(paste0(path, files[i]))
  # The first CSV column holds the gene names written by write.csv().
  rownames(file) <- file[, 1]
  file <- file[, -1]

  group <- matrix(c(rep("Group1", 75), rep("Group2", 75)), nrow = 150, dimnames = list(colnames(file), "Group"))
  group <- factor(group)

  # Named `dge` rather than `list` so base::list is not shadowed.
  dge <- DGEList(file)

  design <- model.matrix(~0 + group)
  colnames(design) <- levels(group)

  # DEG analysis (timed region)
  sim.starttime <- Sys.time()

  # NOTE(review): AveLogCPM is not used below; it is left inside the timed
  # region so runtimes stay comparable with earlier runs. Confirm whether it
  # can be dropped.
  AveLogCPM <- aveLogCPM(dge)
  dge <- calcNormFactors(dge)
  # The argument is `robust` (lower case). The former `Robust = TRUE` did not
  # match it and was silently absorbed by `...`, so robust estimation was off.
  dge <- estimateDisp(dge, design, robust = TRUE)
  fit <- glmQLFit(dge, design, robust = TRUE)

  onev.two <- makeContrasts(Group1 - Group2, levels = design)

  res <- glmQLFTest(fit, contrast = onev.two)

  sim.endtime <- Sys.time()
  # Subtracting two Sys.time() values picks its unit automatically, so the
  # former `runtime * 60` only produced seconds for runs reported in minutes.
  sim.runtime <- as.double(difftime(sim.endtime, sim.starttime, units = "secs"))

  # Genes passing the FDR threshold are the called DEGs.
  topgenes <- as.data.frame(topTags(res, n = 15000, adjust.method = "fdr", p.value = 0.05))

  expDEs <- rownames(topgenes)

  # Ground truth comes from the dataset just read. The previous code indexed an
  # object named `SD` that is never defined in this script.
  trueDEs <- rownames(file[1:1500, ])
  trueEEs <- rownames(file[1501:15000, ])

  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 15000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  # "Runtime" is in seconds.
  simdf <- data.frame("Trial" = files[i], "DEGs" = length(expDEs), "Sensitivity" = sim.sensitivity, "Specificity" = sim.specificity, "Precision" = precision,
                      "Accuracy" = accuracy, "F1" = f1, "Runtime" = sim.runtime)

  edger.finaldf <- rbind(edger.finaldf, simdf)

}
write.csv(edger.finaldf, 'revised_dropout/edger_zero_results.csv')


 # DESeq2
 # For each simulated dataset: run DESeq() (timed), keep genes with a non-NA
 # adjusted p-value <= 0.05 and score them using the row layout assumed below
 # (rows 1-1500 taken as true DE genes, rows 1501-15000 as equivalently
 # expressed genes).

 library(DESeq2)
 path <- "revised_dropout/DropoutDatasets/"
 files <- list.files(path)
 deseq2.finaldf <- data.frame()
 # seq_along() keeps the loop from running (and failing) on an empty directory.
 for (i in seq_along(files)) {
   print(files[i])
   file <- read.csv(paste0(path, files[i]))
   # The first CSV column holds the gene names written by write.csv().
   rownames(file) <- file[, 1]
   file <- file[, -1]

   # Adds 1 to every count of row 1501 before building the dataset.
   # NOTE(review): presumably this guarantees one gene without zeros so that
   # DESeq2's size-factor estimation can run -- confirm.
   file[1501, ] <- file[1501, ] + 1
   filler <- matrix(c(rep("Group.1", 75), rep("Group.2", 75)), nrow = 150, dimnames = list(colnames(file), "Group"))
   DESeqobj <- DESeqDataSetFromMatrix(countData = file, colData = filler, design = ~Group)

   sim.starttime <- Sys.time()
   sim.Deseq <- DESeq(DESeqobj)
   sim.endtime <- Sys.time()
   # Subtracting two Sys.time() values picks its unit automatically, so the
   # former `runtime * 60` only produced seconds for runs reported in minutes.
   sim.runtime.DEseq <- as.double(difftime(sim.endtime, sim.starttime, units = "secs"))

   # Namespace-qualified because scDD, loaded later in this script, also
   # provides a function called results().
   simresults <- DESeq2::results(sim.Deseq)

   expDEs <- simresults[!is.na(simresults$padj), ]
   expDEs <- rownames(expDEs[expDEs$padj <= 0.05, ])

   trueDEs <- rownames(file[1:1500, ])
   trueEEs <- rownames(file[1501:15000, ])

   tp <- sum(trueDEs %in% expDEs)
   fp <- sum(trueEEs %in% expDEs)
   tn <- sum(!(trueEEs %in% expDEs))
   fn <- sum(!(trueDEs %in% expDEs))

   sim.sensitivity <- tp / (tp + fn)
   sim.specificity <- tn / (tn + fp)
   precision <- tp / (tp + fp)
   accuracy <- (tp + tn) / 15000
   f1 <- (2 * tp) / (2 * tp + fp + fn)

   # "Runtime" is in seconds.
   simdf <- data.frame("Trial" = files[i], "DEGs" = length(expDEs), "Sensitivity" = sim.sensitivity, "Specificity" = sim.specificity, "Precision" = precision,
                       "Accuracy" = accuracy, "F1" = f1, "Runtime" = sim.runtime.DEseq, "Returned" = nrow(simresults))

   deseq2.finaldf <- rbind(deseq2.finaldf, simdf)

 }

 write.csv(deseq2.finaldf, "revised_dropout/DEseq2.csv")


 # Monocle
 # For each simulated dataset: fit monocle3's per-gene models on "~Group"
 # (timed), keep genes whose Group coefficient has p_value < 0.05 and score
 # them using the row layout assumed below (rows 1-1500 taken as true DE genes,
 # rows 1501-15000 as equivalently expressed genes).

 library(monocle3)
 library(dplyr)
 path <- "revised_dropout/DropoutDatasets/"
 files <- list.files(path)
 monocle.finaldf <- data.frame()
 # seq_along() keeps the loop from running (and failing) on an empty directory.
 for (i in seq_along(files)) {
   print(files[i])
   file <- read.csv(paste0(path, files[i]))
   # The first CSV column holds the gene names written by write.csv().
   rownames(file) <- file[, 1]
   file <- file[, -1]

   cell_metadata <- data.frame(Group = c(rep(1, 75), rep(2, 75)), row.names = colnames(file))
   gene_metadata <- data.frame(gene_short_name = rownames(file), row.names = rownames(file))
   sim.cds <- new_cell_data_set(data.matrix(file), cell_metadata = cell_metadata, gene_metadata = gene_metadata)

   starttime <- Sys.time()
   gene_fits <- fit_models(sim.cds, model_formula_str = "~Group")
   fit_coefs <- coefficient_table(gene_fits)
   # Subtracting two Sys.time() values picks its unit automatically, so the
   # former `runtime * 60` only produced seconds for runs reported in minutes.
   sim.runtime <- as.double(difftime(Sys.time(), starttime, units = "secs"))

   # One row per gene for the Group coefficient.
   intermediate <- fit_coefs %>% filter(term == "Group")
   sim.degs <- intermediate %>%
     filter(p_value < 0.05) %>%
     select(gene_short_name, p_value)
   sim.degs <- as.data.frame(sim.degs)

   expDEs <- as.vector(sim.degs[, 1])

   # Ground truth comes from the dataset just read. The previous code indexed an
   # object named `SD` that is never defined in this script.
   trueDEs <- rownames(file[1:1500, ])
   trueEEs <- rownames(file[1501:15000, ])

   tp <- sum(trueDEs %in% expDEs)
   fp <- sum(trueEEs %in% expDEs)
   tn <- sum(!(trueEEs %in% expDEs))
   fn <- sum(!(trueDEs %in% expDEs))

   sim.sensitivity <- tp / (tp + fn)
   sim.specificity <- tn / (tn + fp)
   precision <- tp / (tp + fp)
   accuracy <- (tp + tn) / 15000
   f1 <- (2 * tp) / (2 * tp + fp + fn)

   # "Runtime" is in seconds. "Returned" counts the Group-coefficient rows from
   # this run; it previously read `simresults`, a leftover from another
   # package's section.
   simdf <- data.frame("Trial" = files[i], DEGs = length(expDEs), "Sensitivity" = sim.sensitivity, "Specificity" = sim.specificity, "Precision" = precision,
                       "Accuracy" = accuracy, "F1" = f1, "Runtime" = sim.runtime, "Returned" = nrow(intermediate))

   monocle.finaldf <- rbind(monocle.finaldf, simdf)
 }
write.csv(monocle.finaldf, "revised_dropout/monsocle_Zero_Results.csv")


# DEsingle
# For each simulated dataset: run DEsingle (timed), classify with DEtype, keep
# genes with a non-NA FDR-adjusted p-value <= 0.05 and score them using the row
# layout assumed below (rows 1-1500 taken as true DE genes, rows 1501-15000 as
# equivalently expressed genes).


library(DEsingle)
path <- "revised_dropout/DropoutDatasets/"
files <- list.files(path)
DEsingle.finaldf <- data.frame()
# seq_along() keeps the loop from running (and failing) on an empty directory.
for (i in seq_along(files)) {
  print(files[i])
  file <- read.csv(paste0(path, files[i]))
  # The first CSV column holds the gene names written by write.csv().
  rownames(file) <- file[, 1]
  file <- file[, -1]

  simgroups <- factor(c(rep(1, 75), rep(2, 75)))
  sim.starttime <- Sys.time()
  simresults <- DEsingle(file, simgroups)
  sim.endtime <- Sys.time()
  # Subtracting two Sys.time() values picks its unit automatically, so the
  # former `runtime * 60` only produced seconds for runs reported in minutes.
  sim.runtime <- as.double(difftime(sim.endtime, sim.starttime, units = "secs"))

  simresults <- DEtype(simresults, threshold = 0.05)

  expDEs <- simresults[!is.na(simresults$pvalue.adj.FDR), ]
  expDEs <- rownames(expDEs[expDEs$pvalue.adj.FDR <= 0.05, ])

  # Ground truth comes from the dataset just read. The previous code indexed an
  # object named `SD` that is never defined in this script.
  trueDEs <- rownames(file[1:1500, ])
  trueEEs <- rownames(file[1501:15000, ])

  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 15000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  # "Runtime" is in seconds.
  simdf <- data.frame("Trial" = files[i], DEGs = length(expDEs), "Sensitivity" = sim.sensitivity, "Specificity" = sim.specificity, "Precision" = precision,
                      "Accuracy" = accuracy, "F1" = f1, "Runtime" = sim.runtime, "Returned" = nrow(simresults))

  DEsingle.finaldf <- rbind(DEsingle.finaldf, simdf)
  # Rewritten every iteration so partial results survive an interrupted run.
  write.csv(DEsingle.finaldf, "revised_dropout/Desingle.csv")
}

## scDD
# For each simulated dataset: build a SingleCellExperiment, preprocess it, run
# scDD (timed), keep genes with nonzero.pvalue.adj <= 0.05 and score them using
# the row layout assumed below (rows 1-1500 taken as true DE genes, rows
# 1501-15000 as equivalently expressed genes).
library(scDD)
path <- "revised_dropout/DropoutDatasets/"
files <- list.files(path)
scDD.finaldf <- data.frame()
# seq_along() keeps the loop from running (and failing) on an empty directory.
for (i in seq_along(files)) {
  print(files[i])
  file <- read.csv(paste0(path, files[i]))
  # The first CSV column holds the gene names written by write.csv().
  rownames(file) <- file[, 1]
  file <- file[, -1]

  simsce <- SingleCellExperiment(assays = list(counts = file))
  colData(simsce)$condition <- c(rep("Group.1", 75), rep("Group.2", 75))
  simsce <- preprocess(simsce, scran_norm = TRUE)

  starttime <- Sys.time()
  sim.scDD.output <- scDD(simsce)
  # Subtracting two Sys.time() values picks its unit automatically, so the
  # former `runtime * 60` only produced seconds for runs reported in minutes.
  sim.runtime <- as.double(difftime(Sys.time(), starttime, units = "secs"))

  # Namespace-qualified because DESeq2, loaded earlier in this script, also
  # provides a function called results().
  sim.results <- scDD::results(sim.scDD.output)
  # which() drops NA comparisons; plain logical indexing would add all-NA rows
  # and inflate the DEG count.
  sim.degs <- sim.results[which(sim.results$nonzero.pvalue.adj <= 0.05), ]

  expDEs <- rownames(sim.degs)

  # Ground truth comes from the dataset just read. The previous code indexed an
  # object named `SD` that is never defined in this script.
  trueDEs <- rownames(file[1:1500, ])
  trueEEs <- rownames(file[1501:15000, ])

  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 15000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  # "Runtime" is in seconds. "Returned" counts the rows scDD reported for this
  # run; it previously read `simresults`, a leftover from another package's
  # section.
  simdf <- data.frame("Trial" = files[i], DEGs = length(expDEs), "Sensitivity" = sim.sensitivity, "Specificity" = sim.specificity, "Precision" = precision,
                      "Accuracy" = accuracy, "F1" = f1, "Runtime" = sim.runtime, "Returned" = nrow(sim.results))

  scDD.finaldf <- rbind(scDD.finaldf, simdf)
  # Rewritten every iteration so partial results survive an interrupted run.
  write.csv(scDD.finaldf, "revised_dropout/scdd.csv")
}



######################################################################
# ANALYSIS
######################################################################
# Read every result table in zero_prop_Results/ and, for the first six of
# them, compute the mean and sd of one metric across the trials whose name
# starts with each two-character proportion label.
result.files <- paste0("zero_prop_Results/", list.files("zero_prop_Results/"))
result.list <- lapply(result.files, function(f) read.csv(f, sep = ","))

# Proportion labels to summarise. The full 5%-step set would be:
#   c("00","05","10","15","20","25","30","35","40","45","50","55","60")
increments <- c("00","10","20","30","40","50","60")

summary.rows <- vector("list", 6 * length(increments))
slot <- 0
for (pkg in seq_len(6)) {
  pkg.results <- result.list[[pkg]]
  trial.prefix <- substr(pkg.results$Trial, 1, 2)
  for (label in increments) {
    slot <- slot + 1
    # put any parameter of interest here
    metric <- as.numeric(pkg.results$Sensitivity[which(trial.prefix == label)])
    summary.rows[[slot]] <- data.frame(
      "Package" = result.files[pkg],
      "prop" = label,
      "mean" = mean(metric),
      "sd" = sd(metric)
    )
  }
}
plotdf <- do.call(rbind, summary.rows)
plotdf$prop <- as.numeric(plotdf$prop)


## plotdf now holds one mean/sd pair per package and proportion label.

library(ggplot2)
library(ggsci)
# Legend labels are applied to the Package values in scale order.
# NOTE(review): this assumes the six result files sort in this same order --
# verify against the contents of zero_prop_Results/.
packagelabels <- c("DEGage", "DESeq2", "DESingle", "edger", "monocle3", "scDD")
colors <- pal_npg("nrc", alpha = 0.7)(6)

# Mean metric per proportion, one line per package, with +/- 1 sd error bars.
ggplot(
  data = plotdf,
  mapping = aes(
    x = prop,
    y = mean,
    ymin = mean - sd,
    ymax = mean + sd,
    col = Package
  )
) +
  geom_line(linewidth = 1) +
  geom_errorbar(linewidth = 1, width = 1) +
  scale_color_npg(labels = packagelabels) +
  labs(x = "Proportion Of Zero Counts", y = "Sensitivity") +
  theme_minimal()
