library(monocle3)
library("dplyr")

# This file performs the comparative analysis for monocle3 on real positive
# controls, negative controls, and simulated data

### Positive Controls
# Load the positive-control table and move its first column into the row
# names.
PositiveControl <- read.delim("Comparative Analysis/PositiveControl.csv", header = TRUE, sep = ",")
rownames(PositiveControl) <- PositiveControl[, 1]
PositiveControl <- PositiveControl[, -1]
Goldstandard <- read.delim("Comparative Analysis/goldstandard_top1000DEGs.txt", header = TRUE, sep = "\n")

# Trim the table: drop the first 8 rows and first 6 columns, every row whose
# name starts with "r_", and finally columns 93-96 of what remains.
PositiveControl <- PositiveControl[-(1:8), -(1:6)]
PositiveControl <- PositiveControl[substr(rownames(PositiveControl), 1, 2) != "r_", ]
PositiveControl <- PositiveControl[, -(93:96)]

# The first 48 remaining columns are labelled group 1, the last 44 group 2.
cell_metadata <- data.frame(
  Group = rep(c(1, 2), times = c(48, 44)),
  row.names = colnames(PositiveControl)
)
gene_metadata <- data.frame(
  gene_short_name = rownames(PositiveControl),
  row.names = rownames(PositiveControl)
)
pc.cds <- new_cell_data_set(
  data.matrix(PositiveControl),
  cell_metadata = cell_metadata,
  gene_metadata = gene_metadata
)

# Time the model fit. Units are pinned to seconds so the stored value does not
# depend on difftime's automatic unit selection.
starttime <- Sys.time()
gene_fits <- fit_models(pc.cds, model_formula_str = "~Group")
fit_coefs <- coefficient_table(gene_fits)
pc.runtime <- difftime(Sys.time(), starttime, units = "secs")

# Keep the Group coefficient rows with p < 0.05 as the called DEGs.
intermediate <- fit_coefs %>% filter(term == "Group")
pc.degs <- intermediate %>%
  filter(p_value < 0.05) %>%
  select(gene_short_name, p_value)
pc.degs <- as.data.frame(pc.degs)

pcngenes <- nrow(pc.degs)
# as.character() guards against a factor column, whose values would otherwise
# not survive concatenation/comparison as gene names.
pc.deglist <- as.character(pc.degs[, 1])

# Called DEGs that also appear in the gold-standard list. The vectorised
# filter also handles an empty DEG list, where the previous
# `1:length(pc.deglist)` loop stopped with a zero-length `if` condition.
Goldstandard <- unlist(Goldstandard)
commongenes <- pc.deglist[pc.deglist %in% Goldstandard]

# NOTE(review): the denominator assumes the gold-standard list holds exactly
# 1000 genes (as the file name suggests) - confirm against the file.
sensitivity <- length(commongenes) / 1000

#Negative Controls
# For every negative-control dataset, fit the Group model and record the
# specificity, false-positive rate and number of genes called at p < 0.05.
path <- "Comparative Analysis/Datasets/"
ncfiles <- list.files(path)
ncfiles <- ncfiles[grep("NC", ncfiles)]
# One result row per dataset, collected in a list and bound once at the end.
nc_results <- vector("list", length(ncfiles))
for (i in seq_along(ncfiles)) {
  print(i)
  # Datasets are addressed as NC<i>.csv; the directory listing only supplies
  # how many of them to process.
  ncpath <- paste0(path, "NC", i, ".csv")
  NegativeControl <- read.csv(ncpath, header = TRUE, sep = ",", skipNul = TRUE, encoding = "UTF-8")
  rownames(NegativeControl) <- make.unique(NegativeControl$X)
  NegativeControl <- NegativeControl[, -1]

  # The first 38 columns are labelled group 1, the last 38 group 2.
  cell_metadata <- data.frame(
    Group = rep(c(1, 2), times = c(38, 38)),
    row.names = colnames(NegativeControl)
  )
  gene_metadata <- data.frame(
    gene_short_name = rownames(NegativeControl),
    row.names = rownames(NegativeControl)
  )
  nc.cds <- new_cell_data_set(
    data.matrix(NegativeControl),
    cell_metadata = cell_metadata,
    gene_metadata = gene_metadata
  )

  # Time the model fit, in seconds.
  starttime <- Sys.time()
  gene_fits <- fit_models(nc.cds, model_formula_str = "~Group")
  fit_coefs <- coefficient_table(gene_fits)
  nc.runtime <- difftime(Sys.time(), starttime, units = "secs")

  # Genes whose Group coefficient has p < 0.05.
  intermediate <- fit_coefs %>% filter(term == "Group")
  intermediate <- intermediate %>%
    filter(p_value < 0.05) %>%
    select(gene_short_name, p_value)
  ncDEGresults <- as.data.frame(intermediate)
  rownames(ncDEGresults) <- ncDEGresults[, 1]

  # Every called gene is counted as a false positive in this section.
  n_genes <- nrow(NegativeControl)
  ncngenes <- nrow(ncDEGresults)
  # The rate is taken from the scalar count; calling nrow() on that count
  # yields NULL and made the data.frame() construction below fail.
  fprate <- ncngenes / n_genes
  specificity <- (n_genes - ncngenes) / n_genes
  nc_results[[i]] <- data.frame(spec = specificity, fprate = fprate, ndegs = ncngenes)
}
ncdf <- if (length(nc_results) > 0) do.call(rbind, nc_results) else data.frame()
write.csv(ncdf, "Comparative Analysis/ncresults/monocle3.csv")


#Simulated Data
# For each simulated dataset, fit the Group model, compare the genes called at
# p < 0.05 with the known DE/EE split, and append one row of metrics to the
# results file.

path <- "Comparative Analysis/Datasets/"
files <- list.files(path)
# NOTE(review): the datasets are picked by their position in the directory
# listing (entries 32-41), so this depends on the exact folder contents.
files <- files[32:41]
finalmonocledf <- data.frame()
# NOTE(review): the first selected file is skipped, as in the original
# `2:length(files)` loop - confirm this is intended.
for (i in seq_along(files)[-1]) {
  print(files[i])
  SD <- read.delim(paste0(path, files[i]), header = TRUE, sep = ",")
  rownames(SD) <- SD$X
  SD <- SD[, -1]

  # The first 75 columns are labelled group 1, the last 75 group 2.
  cell_metadata <- data.frame(
    Group = rep(c(1, 2), times = c(75, 75)),
    row.names = colnames(SD)
  )
  gene_metadata <- data.frame(
    gene_short_name = rownames(SD),
    row.names = rownames(SD)
  )
  sim.cds <- new_cell_data_set(
    data.matrix(SD),
    cell_metadata = cell_metadata,
    gene_metadata = gene_metadata
  )

  # Time the model fit, in seconds.
  starttime <- Sys.time()
  gene_fits <- fit_models(sim.cds, model_formula_str = "~Group")
  fit_coefs <- coefficient_table(gene_fits)
  sim.runtime <- difftime(Sys.time(), starttime, units = "secs")

  # Genes whose Group coefficient has p < 0.05.
  intermediate <- fit_coefs %>% filter(term == "Group")
  sim.degs <- intermediate %>%
    filter(p_value < 0.05) %>%
    select(gene_short_name, p_value)
  sim.degs <- as.data.frame(sim.degs)
  rownames(sim.degs) <- sim.degs[, 1]

  expDEs <- as.vector(sim.degs[, 1])

  # Rows 1-2000 are scored as truly differentially expressed and rows
  # 2001-20000 as equally expressed. Only the row names are needed, so the
  # name vector is indexed instead of subsetting the whole data frame.
  gene_names <- rownames(SD)
  trueDEs <- gene_names[1:2000]
  trueEEs <- gene_names[2001:20000]

  # Confusion-matrix counts.
  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  # The four counts always sum to the 20000 scored genes, so this equals the
  # former hard-coded denominator.
  accuracy <- (tp + tn) / (tp + tn + fp + fn)
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  # sim.runtime is already in seconds; it was previously multiplied by 60,
  # which inflated the reported runtime sixty-fold.
  simdf <- data.frame(
    "Trial" = files[i],
    DEGs = length(expDEs),
    "Sensitivity" = sim.sensitivity,
    "Specificity" = sim.specificity,
    "Precision" = precision,
    "Accuracy" = accuracy,
    "F1" = f1,
    "Runtime" = as.double(sim.runtime)
  )

  finalmonocledf <- rbind(finalmonocledf, simdf)
  # Rewritten after every dataset, so the file always holds the rows
  # accumulated so far.
  write.csv(finalmonocledf, "Comparative Analysis/EE_results/monocle.csv")
}


