library('DESeq2')
# This file performs the comparative analysis for DESeq2 on real positive
# controls, negative controls, and simulated data

### Positive Controls real data

# Gold-standard DEG list: read with newline as the field separator, giving a
# one-column table with one entry per line of the file.
Goldstandard <- read.delim("Comparative Analysis/goldstandard_top1000DEGs.txt", header = TRUE, sep = "\n")

# Positive-control table: the first column supplies the row names and is
# then removed from the data.
PositiveControl <- read.delim("Comparative Analysis/PositiveControl.csv", header = TRUE, sep = ",")
rownames(PositiveControl) <- PositiveControl[, 1]
PositiveControl <- PositiveControl[, -1]

# Trim by position: drop the first 8 rows and the first 6 columns
# (presumably non-gene / annotation entries -- confirm against the CSV).
PositiveControl <- PositiveControl[-(1:8), -(1:6)]
# Discard every row whose name starts with "r_".
PositiveControl <- PositiveControl[!startsWith(rownames(PositiveControl), "r_"), ]
# Drop columns 93-96 of what remains, leaving the 92 columns that the
# 48 + 44 sample layout used further down expects.
PositiveControl <- PositiveControl[, -(93:96)]

#Positive Control Testing#############################################################################################

# Placeholder sample metadata: the first 48 columns are labelled
# "Stem Cells" and the remaining 44 "Fibroblasts"; wrap the counts and
# this metadata in a DESeq2 dataset with a one-factor design.
coldat <- data.frame(condition = factor(rep(c("Stem Cells", "Fibroblasts"), times = c(48, 44))))
DESeqPositiveControl <- DESeqDataSetFromMatrix(countData = PositiveControl, colData = coldat, design = ~ condition)

# Differential expression analysis, timed with wall-clock timestamps
# taken immediately before and after the DESeq() call.
pc.starttime <- Sys.time()
pc.deseq <- DESeq(DESeqPositiveControl)
pc.endtime <- Sys.time()
pc.runtime.DEseq <- difftime(pc.endtime, pc.starttime)
Tpresults <- results(pc.deseq)

# Sort the results by ascending adjusted p-value (missing values last).
orderedpvalues <- as.data.frame(Tpresults[order(Tpresults$padj), ])

# Keep the rows with a non-missing adjusted p-value of at most 0.05;
# which() skips the NA comparisons, so no separate NA filter is needed.
DEGs <- orderedpvalues[which(orderedpvalues$padj <= 0.05), ]

# Number of genes called differentially expressed.
pc.ngenes <- nrow(DEGs)

# Check which experimentally called DEGs also appear in the gold-standard
# list, and derive the sensitivity from that overlap.
#
# The gold-standard table is flattened to a plain character vector so the
# membership test is a single vectorised %in%. Unlike a per-gene `==` loop
# this also works when there are zero DEGs and never yields NA.
goldgenes <- as.character(unlist(Goldstandard, use.names = FALSE))
commongenes <- rownames(DEGs)[rownames(DEGs) %in% goldgenes]

# Sensitivity = recovered gold-standard genes / size of the gold standard.
# No "- 1" correction is applied: commongenes holds only real gene names
# (there is no placeholder element to discount).
DESEQsensitivity <- length(commongenes) / nrow(Goldstandard)


#Negative Controls#######################################################################################
# Negative-control benchmark. Each NC<i>.csv dataset is split into two
# placeholder groups (first 38 columns vs. last 38 columns); every gene
# DESeq2 reports at padj <= 0.05 is counted as a false positive. One summary
# row per dataset (specificity, false-positive rate, number of DEGs) is
# collected and written to a CSV at the end.
path <- "Comparative Analysis/Datasets/"
ncfiles <- list.files(path)
ncfiles <- ncfiles[grep("NC", ncfiles)]

# One slot per dataset; the rows are bound together once after the loop.
nc.rows <- vector("list", length(ncfiles))

# NOTE(review): datasets are addressed as NC1.csv .. NC<n>.csv, where n is the
# number of file names containing "NC" -- this assumes the directory holds
# exactly those files; confirm.
for (i in seq_along(ncfiles)) {
  print(i)
  ncpath <- paste0(path, "NC", i, ".csv")
  NegativeControl <- read.csv(ncpath, header = TRUE, sep = ",", skipNul = TRUE, encoding = "UTF-8")
  rownames(NegativeControl) <- make.unique(NegativeControl$X)
  NegativeControl <- NegativeControl[, -1]

  # Denormalizing (without known scale factor): scale by 100, truncate to
  # integers, drop rows containing NA, then add a pseudocount of 1.
  # Assigning through `[]` converts the columns in place so the gene row
  # names set above are kept.
  NegativeControl <- NegativeControl * 100.0
  NegativeControl[] <- lapply(NegativeControl, as.integer)
  NegativeControl <- na.omit(NegativeControl)
  NegativeControl <- NegativeControl + 1

  # Building of DESeq object and DEG analysis
  filler <- matrix(c(rep("Group.1", 38), rep("Group.2", 38)), nrow = 76, dimnames = list(colnames(NegativeControl), 'Group'))
  DESeqobj <- DESeqDataSetFromMatrix(countData = NegativeControl, colData = filler, design = ~Group)
  nc.starttime <- Sys.time()
  nc.Deseq <- DESeq(DESeqobj)
  nc.endtime <- Sys.time()
  nc.runtime.DEseq <- nc.endtime - nc.starttime
  ncresults <- results(nc.Deseq)

  # Determining number of DEGs, false-positive rate, and specificity
  ncresults <- ncresults[!is.na(ncresults$padj), ]
  ncDEGs <- ncresults[ncresults$padj <= 0.05, ]
  ncDEGsfound <- nrow(ncDEGs)

  # Both rates are relative to the number of genes that entered DESeq2
  # (rows remaining after the NA filter above).
  ngenes.tested <- nrow(NegativeControl)
  false.positive.rate <- ncDEGsfound / ngenes.tested
  specificity <- (ngenes.tested - ncDEGsfound) / ngenes.tested

  # ncDEGsfound is already a count; calling nrow() on it would give NULL and
  # data.frame() would silently omit the ndegs column.
  nc.rows[[i]] <- data.frame(spec = specificity, fprate = false.positive.rate, ndegs = ncDEGsfound)
}
ncdf <- if (length(nc.rows) > 0) do.call(rbind, nc.rows) else data.frame()
write.csv(ncdf, "Comparative Analysis/ncresults/DESeq2.csv")


#Simulated Data####################################################################################################

# Simulated-data benchmark. For each selected dataset, the first 2000 rows
# are treated as truly differentially expressed (DE) and rows 2001-20000 as
# equally expressed (EE). DESeq2 calls at padj <= 0.05 are scored against
# that truth and one summary row per dataset is written to a CSV at the end.
path <- "Comparative Analysis/Datasets/"
files <- list.files(path)
# NOTE(review): the simulated datasets are picked by position (entries 32-41
# of the directory listing) -- verify this still matches the directory.
files <- files[32:41]

# One slot per dataset; the rows are bound together once after the loop.
sim.rows <- vector("list", length(files))

for (i in seq_along(files)) {
  print(files[i])
  SD <- read.delim(paste0(path, files[i]), header = TRUE, sep = ",")
  rownames(SD) <- SD$X
  SD <- SD[, -1]

  # Scale by 100 and truncate every column to integers. This is done
  # column-wise without an index loop, so the outer loop variable `i` is
  # left intact and files[i] below still names the current dataset.
  simcounts <- SD * 100.0
  simcounts[] <- lapply(simcounts, as.integer)

  # Placeholder grouping: first 75 columns vs. last 75 columns. A
  # pseudocount of 1 is added to the counts handed to DESeq2.
  filler <- matrix(c(rep("Group.1", 75), rep("Group.2", 75)), nrow = 150, dimnames = list(colnames(SD), 'Group'))
  DESeqobj <- DESeqDataSetFromMatrix(countData = simcounts + 1, colData = filler, design = ~Group)
  sim.starttime <- Sys.time()
  sim.Deseq <- DESeq(DESeqobj)
  sim.endtime <- Sys.time()
  sim.runtime.DEseq <- sim.endtime - sim.starttime
  simresults <- results(sim.Deseq)

  # Genes called DE: non-missing adjusted p-value of at most 0.05.
  expDEs <- simresults[!is.na(simresults$padj), ]
  expDEs <- rownames(expDEs[expDEs$padj <= 0.05, ])

  # Ground truth by row position.
  trueDEs <- rownames(SD)[1:2000]
  trueEEs <- rownames(SD)[2001:20000]

  # Confusion-matrix counts.
  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / 20000
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  # Runtime is reported in seconds. The unit is requested explicitly because
  # a difftime from `-` picks its own unit (secs/mins/hours) by magnitude,
  # so multiplying by 60 is only right when that unit happens to be minutes.
  sim.rows[[i]] <- data.frame("Trial" = files[i], DEGs = length(expDEs), "Sensitivity" = sim.sensitivity, "Specificity" = sim.specificity, "Precision" = precision,
                              "Accuracy" = accuracy, "F1" = f1, "Runtime" = as.double(sim.runtime.DEseq, units = "secs"), "Returned" = nrow(simresults))
}
finaldeseqdf <- do.call(rbind, sim.rows)
write.csv(finaldeseqdf, "Comparative Analysis/EE_results/Deseq2.csv")



