# This file performs the comparative analysis for the Wilcoxon rank-sum test
# on real positive controls, negative controls, and simulated data.

### Positive controls (real data)
# Runs a per-gene Wilcoxon rank-sum test between the two sample groups of the
# positive-control dataset, adjusts the p-values with the FDR method, and
# reports the fraction of gold-standard entries recovered at FDR <= 0.05.

PositiveControl <- read.delim("Comparative Analysis/PositiveControl.csv", header = TRUE, sep = ",")
# The first column supplies the row names and is then removed.
rownames(PositiveControl) <- PositiveControl[, 1]
PositiveControl <- PositiveControl[, -1]
# One entry per line, read into the single column V1.
Goldstandard <- read.delim("Comparative Analysis/goldstandard_top1000DEGs.txt", header = FALSE, sep = "\n")
# Drop the first 8 rows and first 6 columns, every row whose name starts with
# "r_", and finally columns 93-96 of what remains.
PositiveControl <- PositiveControl[-1:-8, -1:-6]
PositiveControl <- PositiveControl[substr(rownames(PositiveControl), 1, 2) != "r_", ]
PositiveControl <- PositiveControl[, -93:-96]

# Group labels: the first 48 columns are group 1, the next 44 are group 2.
x <- factor(c(rep(1, 48), rep(2, 44)))
# Fail loudly if the trimmed table does not match the hard-coded group layout;
# a shorter/longer logical column index would otherwise be recycled or error
# deep inside the loop.
stopifnot(ncol(PositiveControl) == length(x))
pc.starttime <- Sys.time()

# Split the columns by group once instead of re-subsetting the whole data
# frame for every gene.
pc.group1 <- PositiveControl[, x == 1]
pc.group2 <- PositiveControl[, x == 2]
# One p-value per gene; vapply preallocates and guarantees a numeric result,
# and seq_len() is safe when the table has zero rows.
pvals <- vapply(seq_len(nrow(PositiveControl)), function(i) {
  wilcox.test(as.numeric(pc.group1[i, ]), as.numeric(pc.group2[i, ]))$p.value
}, numeric(1))
pcresults <- data.frame(gene = rownames(PositiveControl), pval = pvals, fdr = p.adjust(pvals, method = "fdr"))
pc.endtime <- Sys.time()
pc.runtime <- pc.endtime - pc.starttime

# Genes whose test produced no p-value are counted, then excluded.
nNAs <- sum(is.na(pcresults$pval))
pcresults <- pcresults[!is.na(pcresults$pval), ]
pcdegs <- pcresults[pcresults$fdr <= 0.05, ]

# Sensitivity = called DEGs that appear in the gold standard, divided by the
# size of the gold standard.
samegenes <- sum(pcdegs$gene %in% Goldstandard$V1)
sensitivity <- samegenes / nrow(Goldstandard)
sensitivity
nrow(pcdegs)

# Negative controls
# For each negative-control dataset NC<i>.csv, runs a per-gene Wilcoxon
# rank-sum test between the first 38 samples and the remaining samples,
# adjusts with the FDR method, and records specificity, false-positive rate
# and the number of genes called at FDR < 0.05. One row per dataset is
# written to ncresults/wilcoxon.csv.
path <- "Comparative Analysis/Datasets/"
ncfiles <- list.files(path)
ncfiles <- ncfiles[grep("NC", ncfiles)]
ncdf <- data.frame()
# NOTE(review): processing starts at NC7, so NC1-NC6 never reach the output
# file. This looks like a leftover from resuming an interrupted run - confirm
# and set nc.first to 1 if all datasets should be included.
nc.first <- 7
# seq() with length.out yields an empty sequence (instead of counting
# backwards like 7:length(ncfiles)) when fewer than nc.first files exist.
nc.indices <- seq(nc.first, length.out = max(0, length(ncfiles) - nc.first + 1))
for (i in nc.indices) {
  print(i)
  ncpath <- paste0(path, "NC", i, ".csv")
  NegativeControl <- read.csv(ncpath, header = TRUE, sep = ",", skipNul = TRUE, encoding = "UTF-8")
  rownames(NegativeControl) <- make.unique(NegativeControl$X)
  # Remove the gene-name column now that it lives in the row names. It was
  # previously left in place, so the "first 38 columns" contained the names
  # (coerced to NA) plus only 37 samples, and the 38th sample was tested as
  # part of the second group.
  NegativeControl <- NegativeControl[, names(NegativeControl) != "X", drop = FALSE]

  # Group labels: first 38 samples are group 1, every remaining sample is
  # group 2.
  x <- factor(rep(c(1, 2), times = c(38, ncol(NegativeControl) - 38)))

  # Time the testing itself; the timestamps were previously taken back to
  # back after the work had finished.
  nc.starttime <- Sys.time()
  nc.group1 <- NegativeControl[, x == 1, drop = FALSE]
  nc.group2 <- NegativeControl[, x == 2, drop = FALSE]
  # Uses j so the dataset index i is not overwritten by the per-gene loop.
  pvals <- vapply(seq_len(nrow(NegativeControl)), function(j) {
    wilcox.test(as.numeric(nc.group1[j, ]), as.numeric(nc.group2[j, ]))$p.value
  }, numeric(1))
  ncresults <- data.frame(gene = rownames(NegativeControl), pval = pvals, FDR = p.adjust(pvals, method = "fdr"))
  nc.endtime <- Sys.time()
  nc.runtime <- nc.endtime - nc.starttime

  # Genes without a usable FDR are counted, then excluded from the calls.
  nc.nNAs <- sum(is.na(ncresults$FDR))
  ncresults <- ncresults[!is.na(ncresults$FDR), ]
  ncdegs <- ncresults[ncresults$FDR < 0.05, ]

  # Every call on a negative control is a false positive. The denominator is
  # the full gene count, so genes without a p-value count as not called.
  specificity <- (nrow(NegativeControl) - nrow(ncdegs)) / nrow(NegativeControl)

  false.positive.rate <- nrow(ncdegs) / nrow(NegativeControl)
  tempdf <- data.frame(spec = specificity, fprate = false.positive.rate, ndegs = nrow(ncdegs))
  ncdf <- rbind(ncdf, tempdf)
}
write.csv(ncdf, "Comparative Analysis/ncresults/wilcoxon.csv")

# Simulated data
# For each simulated dataset, runs a per-gene Wilcoxon rank-sum test between
# the first 75 and the last 75 samples, calls genes at FDR <= 0.05, and scores
# the calls against the known truth: the first 2000 rows are treated as truly
# differentially expressed, all later rows as equally expressed. One row of
# metrics per dataset is written to EE_results/wilcoxon.csv.
path <- "Comparative Analysis/Datasets/"
files <- list.files(path)
# NOTE(review): the datasets are picked by their position in the directory
# listing, so adding or renaming files in this folder changes which ones are
# analysed - confirm that entries 32-41 are the intended files.
files <- files[32:41]
finaldf <- data.frame()
for (i in seq_along(files)) {
  print(files[i])
  SD <- read.delim(paste0(path, files[i]), header = TRUE, sep = ",")
  # Column X supplies the row names; the first column is then removed.
  rownames(SD) <- SD$X
  SD <- SD[, -1]

  x <- factor(c(rep(1, 75), rep(2, 75)))
  sim.starttime <- Sys.time()
  # Split the columns by group once rather than once per gene.
  sim.group1 <- SD[, x == 1]
  sim.group2 <- SD[, x == 2]
  pvals <- vapply(seq_len(nrow(SD)), function(j) {
    wilcox.test(as.numeric(sim.group1[j, ]), as.numeric(sim.group2[j, ]))$p.value
  }, numeric(1))
  simresults <- data.frame(gene = rownames(SD), pval = pvals, FDR = p.adjust(pvals, method = "fdr"))
  sim.endtime <- Sys.time()
  sim.runtime <- difftime(sim.endtime, sim.starttime, units = "secs")

  # Genes without a p-value are kept aside, counted, and excluded.
  nas <- simresults[is.na(simresults$pval), ]
  nNAs <- nrow(nas)
  simresults <- simresults[!is.na(simresults$pval), ]
  # Call DEGs on the adjusted p-value, matching the positive- and
  # negative-control sections. The FDR column was computed but the raw
  # p-value was used for the cut-off before.
  expDEs <- simresults[simresults$FDR <= 0.05, ]$gene

  # Ground truth by row position.
  n.true.de <- 2000
  trueDEs <- head(rownames(SD), n.true.de)
  trueEEs <- tail(rownames(SD), -n.true.de)

  # Confusion matrix; genes without a p-value are never in expDEs, so they
  # count as not called.
  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  # Total gene count taken from the confusion matrix instead of a hard-coded
  # 20000, so the value stays correct for datasets of any size.
  accuracy <- (tp + tn) / (tp + tn + fp + fn)
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  # Runtime is reported in seconds. sim.runtime is already in seconds, so the
  # former multiplication by 60 inflated the value sixty-fold.
  simdf <- data.frame("Trial" = files[i], DEGs = length(expDEs), "Sensitivity" = sim.sensitivity, "Specificity" = sim.specificity, "Precision" = precision,
                      "Accuracy" = accuracy, "F1" = f1, "Runtime" = as.double(sim.runtime, units = "secs"), "Returned" = nrow(simresults))
  print(simdf)
  finaldf <- rbind(finaldf, simdf)
}

write.csv(finaldf, 'Comparative Analysis/EE_results/wilcoxon.csv')







