library('DEGage')

# This file performs the comparative analysis for DEGage on real positive
# controls, negative controls, and simulated data

### Positive Controls real data

# Load the positive-control table; the first CSV column holds the identifiers
# that become the row names.
PositiveControl <- read.delim("Comparative Analysis/PositiveControl.csv",
                              header = TRUE, sep = ",")
rownames(PositiveControl) <- PositiveControl[, 1]
PositiveControl <- PositiveControl[, -1]

# Gold-standard gene list, read with newline as the separator so that each
# line becomes one entry of a single column.
Goldstandard <- read.delim("Comparative Analysis/goldstandard_top1000DEGs.txt",
                           header = TRUE, sep = "\n")

# Trim the table down to the count matrix: drop the first 8 rows and first 6
# columns, every row whose name starts with "r_", and columns 93-96.
# NOTE(review): presumably this leaves the 92 sample columns expected by the
# 48 + 44 grouping factor below -- confirm against the input file.
PositiveControl <- PositiveControl[-1:-8, -1:-6]
PositiveControl <- PositiveControl[substr(rownames(PositiveControl), 1, 2) != "r_", ]
PositiveControl <- PositiveControl[, -93:-96]

# run trial
x <- factor(c(rep(1, 48), rep(2, 44)))
pc.starttime <- Sys.time()
pcresults <- DEGage(PositiveControl, x)
pc.endtime <- Sys.time()
pc.runtime <- pc.endtime - pc.starttime

# Count, then discard, genes for which no p-value was returned.
nNAs <- sum(is.na(pcresults$pval))
pcresults <- pcresults[!is.na(pcresults$pval), ]

# Genes called differentially expressed at FDR <= 0.05. The subset is taken
# from pcresults (pcdegs does not exist before this line); which() also drops
# any row whose FDR is NA instead of turning it into an all-NA row.
pcdegs <- pcresults[which(pcresults$FDR <= 0.05), ]

genenames <- rownames(pcdegs)

# find DEGs in the gold standard gene set
# Start from the matches only (no placeholder element) so that
# length(samegenes) is exactly the number of recovered gold-standard genes.
goldgenes <- unlist(Goldstandard, use.names = FALSE)
samegenes <- genenames[genenames %in% goldgenes]

# output sensitivity
sensitivity <- length(samegenes) / nrow(Goldstandard)
sensitivity
nrow(pcdegs)

### Negative Controls real data
path <- "Comparative Analysis/Datasets/"
# Count only files named NC<number>.csv: the loop below rebuilds each path as
# NC<i>.csv, so any other file that merely contains "NC" must not inflate the
# number of iterations.
ncfiles <- list.files(path)
ncfiles <- ncfiles[grepl("^NC[0-9]+\\.csv$", ncfiles)]

# One summary row per dataset, collected in a preallocated list and bound once
# after the loop instead of growing a data frame on every iteration.
ncrows <- vector("list", length(ncfiles))
for (i in seq_along(ncfiles)) {
  print(i)
  ncpath <- paste0(path, "NC", i, ".csv")
  NegativeControl <- read.csv(ncpath, header = TRUE, sep = ",",
                              skipNul = TRUE, encoding = "UTF-8")
  # Column X holds the gene names; make.unique() disambiguates duplicates so
  # they can be used as row names.
  rownames(NegativeControl) <- make.unique(NegativeControl$X)
  NegativeControl <- NegativeControl[, -1]

  # Two groups of 38 columns each.
  x <- factor(c(rep(1, 38), rep(2, 38)))
  nc.starttime <- Sys.time()
  ncresults <- DEGage(NegativeControl, x)
  nc.endtime <- Sys.time()
  nc.runtime <- nc.endtime - nc.starttime

  # Every gene called at FDR < 0.05 is counted as a false positive.
  ncresults <- ncresults[!is.na(ncresults$FDR), ]
  ncdegs <- ncresults[ncresults$FDR < 0.05, ]

  # Both rates use the full input gene count as the denominator, including
  # genes whose FDR was NA and which were dropped above.
  specificity <- (nrow(NegativeControl) - nrow(ncdegs)) / nrow(NegativeControl)

  false.positive.rate <- nrow(ncdegs) / nrow(NegativeControl)
  tempdf <- data.frame(spec = specificity, fprate = false.positive.rate,
                       ndegs = nrow(ncdegs))
  ncrows[[i]] <- tempdf
}
# With no matching datasets an empty data frame is written.
ncdf <- if (length(ncrows) > 0) do.call(rbind, ncrows) else data.frame()
write.csv(ncdf, "Comparative Analysis/ncresults/DEGage.csv")

### Simulated Data
path <- "Comparative Analysis/Datasets/"
files <- list.files(path)
# NOTE(review): the simulated datasets are picked by their position in the
# directory listing; adding or removing files in this folder shifts the
# indices -- confirm that 32:41 still selects the intended files.
files <- files[32:41] #get simulated datasets
finaldf <- data.frame()
for (i in seq_along(files)) {
  print(files[i])
  SD <- read.delim(paste0(path, files[i]), header = TRUE, sep = ",")
  # Column X holds the gene names.
  rownames(SD) <- SD$X
  SD <- SD[, -1]

  # Two groups of 75 columns each.
  x <- factor(c(rep(1, 75), rep(2, 75)))
  sim.starttime <- Sys.time()
  simresults <- DEGage(SD, x, perm.preprocess = FALSE, mean.ratio = 1.4, nperms = 10000)
  sim.endtime <- Sys.time()
  # Elapsed time of the DEGage call only, always expressed in seconds.
  sim.runtime <- difftime(sim.endtime, sim.starttime, units = "secs")
  # Unfiltered copy of the DEGage output, kept for inspection.
  results <- simresults

  # Called DEGs: genes with a p-value, permPvals < 0.1 and pval <= 0.05.
  # NOTE(review): assumes DEGage still returns a permPvals column when
  # perm.preprocess = FALSE; also, this section thresholds on the raw pval
  # rather than FDR -- confirm both are intended.
  simresults <- simresults[!is.na(simresults$pval), ]
  res <- simresults[simresults$permPvals < 0.1, ]
  expDEs <- rownames(res[res$pval <= 0.05, ])

  # The first 2000 rows are treated as truly differentially expressed, all
  # remaining rows as equivalently expressed.
  trueDEs <- rownames(SD[1:2000, ])

  trueEEs <- rownames(SD[2001:nrow(SD), ])

  # Confusion-matrix counts.
  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  # Divide by the number of genes actually classified rather than a fixed
  # 20000, so the value stays correct for datasets of any size.
  accuracy <- (tp + tn) / (tp + tn + fp + fn)
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  simdf <- data.frame("Trial" = files[i],
                      DEGs = length(expDEs),
                      "Sensitivity" = sim.sensitivity,
                      "Specificity" = sim.specificity,
                      "Precision" = precision,
                      "Accuracy" = accuracy,
                      "F1" = f1,
                      # Seconds; sim.runtime is already in seconds, so no
                      # further unit conversion is applied.
                      "Runtime" = as.double(sim.runtime),
                      "Returned" = nrow(simresults))
  print(simdf)
  finaldf <- rbind(finaldf, simdf)
  # Rewritten after every dataset so partial results survive an interrupted
  # run.
  write.csv(finaldf, "Comparative Analysis/EE_results/DEGage.csv")
}








