library('edgeR')

# This file performs the comparative analysis for edgeR on real positive
# controls, negative controls, and simulated data

### Positive Controls real data

# Load the positive-control count table and the gold-standard DEG list.
# The first CSV column is used as the row names and then dropped.
PositiveControl <- read.delim("Comparative Analysis/PositiveControl.csv", header = TRUE, sep = ",")
rownames(PositiveControl) <- PositiveControl[, 1]
PositiveControl <- PositiveControl[, -1]
# NOTE(review): header = TRUE consumes the first line of the gold-standard
# file as a column name - confirm that line is a header and not a gene.
Goldstandard <- read.delim("Comparative Analysis/goldstandard_top1000DEGs.txt", header = TRUE, sep = "\n")

# Drop the first 8 rows and first 6 columns, every row whose name starts with
# "r_", and columns 93-96 of what remains.
# NOTE(review): presumably these are non-gene rows / non-sample columns -
# confirm against the layout of the input file.
PositiveControl <- PositiveControl[-(1:8), -(1:6)]
PositiveControl <- PositiveControl[substr(rownames(PositiveControl), 1, 2) != "r_", ]
PositiveControl <- PositiveControl[, -(93:96)]

# Creation of edgeR object
# The first 48 columns are labelled Stem.Cells and the next 44 Fibroblasts.
group <- factor(c(rep("Stem.Cells", 48), rep("Fibroblasts", 44)))
# Fail early if the table does not have exactly one label per sample column.
stopifnot(length(group) == ncol(PositiveControl))

# Named `dge` rather than `list` so the base function list() is not shadowed.
dge <- DGEList(PositiveControl)

# Group-means design (no intercept); columns are renamed to the bare group
# labels so they can be referenced in makeContrasts().
design <- model.matrix(~ 0 + group)
colnames(design) <- levels(group)

# DEG Analysis (timed)
pc.starttime <- Sys.time()

# NOTE(review): AveLogCPM is not read again in this section.
AveLogCPM <- aveLogCPM(dge)
dge <- calcNormFactors(dge)
# The argument is `robust` (lower case); R argument names are case-sensitive,
# so the previous `Robust = TRUE` did not request robust estimation.
dge <- estimateDisp(dge, design, robust = TRUE)
fit <- glmQLFit(dge, design, robust = TRUE)

onev.two <- makeContrasts(Stem.Cells - Fibroblasts, levels = design)

res <- glmQLFTest(fit, contrast = onev.two)

pc.endtime <- Sys.time()
pc.runtime <- pc.endtime - pc.starttime

# Keep every gene with FDR-adjusted p-value <= 0.05. `n` is set to the number
# of tested genes so the result is never truncated by a fixed cap.
topgenes <- as.data.frame(topTags(res, n = nrow(res$table), adjust.method = "fdr", p.value = 0.05))
pc.ngenes <- nrow(topgenes)
genenames <- rownames(topgenes)

# Get DEGs that are also in the gold-standard set.
# intersect() starts from an empty result, so the count is not inflated by a
# placeholder element.
goldgenes <- as.character(unlist(Goldstandard, use.names = FALSE))
samegenes <- intersect(genenames, goldgenes)
# Fraction of gold-standard entries recovered among the called DEGs.
sensitivity <- length(samegenes) / nrow(Goldstandard)
sensitivity


### Negative Controls - This follows the standard workflow, including filtering of low counts with filterByExpr
# Run edgeR on every negative-control dataset (files NC1.csv, NC2.csv, ... in
# `path`) and record, per dataset, the specificity, the false positive rate
# and the number of genes called differentially expressed.
path <- "Comparative Analysis/Datasets/"
ncfiles <- list.files(path)
ncfiles <- ncfiles[grep("NC", ncfiles)]
ncdf <- data.frame()
# seq_along() yields an empty sequence when no NC files are found, whereas
# 1:length() would iterate over c(1, 0).
for (i in seq_along(ncfiles)) {
  print(i)
  # The file name is rebuilt from the index; ncfiles only supplies the count.
  ncpath <- paste(path, "NC", i, ".csv", sep = "")
  NegativeControl <- read.csv(ncpath, header = TRUE, sep = ",", skipNul = TRUE, encoding = "UTF-8")
  # Column X supplies the row names; make.unique() de-duplicates repeats.
  rownames(NegativeControl) <- make.unique(NegativeControl$X)
  NegativeControl <- NegativeControl[, -1]

  # The first 38 columns are labelled Group1 and the next 38 Group2.
  group <- factor(c(rep("Group1", 38), rep("Group2", 38)))
  # Fail early if the table does not have exactly one label per sample column.
  stopifnot(length(group) == ncol(NegativeControl))

  # Named `dge` rather than `list` so the base function list() is not shadowed.
  dge <- DGEList(NegativeControl)

  design <- model.matrix(~ 0 + group)
  colnames(design) <- levels(group)

  # DEG Analysis (timed)
  nc.starttime <- Sys.time()

  # Filter genes with filterByExpr and recompute library sizes afterwards.
  keep <- filterByExpr(dge, design)
  dge <- dge[keep, , keep.lib.sizes = FALSE]
  # NOTE(review): AveLogCPM is not read again in this section.
  AveLogCPM <- aveLogCPM(dge)
  dge <- calcNormFactors(dge)
  # The argument is `robust` (lower case); R argument names are case-sensitive,
  # so the previous `Robust = TRUE` did not request robust estimation.
  dge <- estimateDisp(dge, design, robust = TRUE)
  fit <- glmQLFit(dge, design, robust = TRUE)

  onev.two <- makeContrasts(Group1 - Group2, levels = design)

  res <- glmQLFTest(fit, contrast = onev.two)

  is.de <- decideTestsDGE(res)
  final.results <- summary(is.de)

  nc.endtime <- Sys.time()
  # NOTE(review): nc.runtime is computed but not written to the results file.
  nc.runtime <- nc.endtime - nc.starttime

  # Analyzing results and determining false positive rate, specificity, and
  # total DEGs detected in the negative control.
  # Entries 1 and 3 of the summary are added as the DEG count.
  # NOTE(review): presumably these are the down- and up-regulated counts -
  # confirm against the summary() row order.
  nc.DEGs <- final.results[1] + final.results[3]
  # Both rates use the unfiltered gene count as the denominator.
  specificity <- (nrow(NegativeControl) - nc.DEGs) / (nrow(NegativeControl))
  false.positive.rate <- nc.DEGs / (nrow(NegativeControl))
  tempdf <- data.frame(spec = specificity, fprate = false.positive.rate, ndegs = nc.DEGs)
  ncdf <- rbind(ncdf, tempdf)
}
write.csv(ncdf, "Comparative Analysis/ncresults/edger.csv")


#Simulated Data
# Run edgeR on each simulated dataset and record classification metrics and
# runtime (in seconds) per dataset.
path <- "Comparative Analysis/Datasets/"
files <- list.files(path)
# NOTE(review): the simulated datasets are selected purely by their position
# (32-41) in the directory listing - confirm this still matches the folder.
files <- files[32:41]
finaledgerdf <- data.frame()

# The code treats rows 1-2000 of each dataset as true DEGs and rows
# 2001-20000 as equally expressed.
n.true.de <- 2000
n.genes <- 20000

for (i in seq_along(files)) {
  print(files[i])
  SD <- read.delim(paste(path, files[i], sep = ""), header = TRUE, sep = ",")
  # Column X supplies the row names and is then dropped.
  rownames(SD) <- SD$X
  SD <- SD[, -1]

  # The first 75 columns are labelled Group1 and the next 75 Group2.
  group <- factor(c(rep("Group1", 75), rep("Group2", 75)))
  # Fail early if the table does not have exactly one label per sample column.
  stopifnot(length(group) == ncol(SD))

  # Named `dge` rather than `list` so the base function list() is not shadowed.
  dge <- DGEList(SD)

  design <- model.matrix(~ 0 + group)
  colnames(design) <- levels(group)

  # DEG Analysis (timed)
  sim.starttime <- Sys.time()

  # NOTE(review): AveLogCPM is not read again in this section.
  AveLogCPM <- aveLogCPM(dge)
  dge <- calcNormFactors(dge)
  # The argument is `robust` (lower case); R argument names are case-sensitive,
  # so the previous `Robust = TRUE` did not request robust estimation.
  dge <- estimateDisp(dge, design, robust = TRUE)
  fit <- glmQLFit(dge, design, robust = TRUE)

  onev.two <- makeContrasts(Group1 - Group2, levels = design)

  res <- glmQLFTest(fit, contrast = onev.two)

  sim.endtime <- Sys.time()
  # difftime(time1, time2) is time1 - time2, so the end time goes first.
  # Units are fixed to seconds so the value does not depend on difftime's
  # automatic unit selection.
  sim.runtime <- difftime(sim.endtime, sim.starttime, units = "secs")

  # Analyzing results. The genes called DE (FDR-adjusted p-value <= 0.05) are
  # compared against the row positions treated as true DEGs above. `n` is the
  # number of tested genes so the result is never truncated.
  topgenes <- as.data.frame(topTags(res, n = nrow(res$table), adjust.method = "fdr", p.value = 0.05))

  expDEs <- rownames(topgenes)

  trueDEs <- rownames(SD)[1:n.true.de]
  trueEEs <- rownames(SD)[(n.true.de + 1):n.genes]

  # Confusion-matrix counts.
  tp <- sum(trueDEs %in% expDEs)
  fp <- sum(trueEEs %in% expDEs)
  tn <- sum(!(trueEEs %in% expDEs))
  fn <- sum(!(trueDEs %in% expDEs))

  sim.sensitivity <- tp / (tp + fn)
  sim.specificity <- tn / (tn + fp)
  # precision is NaN when no gene is called DE (tp + fp == 0).
  precision <- tp / (tp + fp)
  accuracy <- (tp + tn) / n.genes
  f1 <- (2 * tp) / (2 * tp + fp + fn)

  # Runtime is reported in seconds.
  simdf <- data.frame("Trial" = files[i], "DEGs" = length(expDEs), "Sensitivity" = sim.sensitivity, "Specificity" = sim.specificity, "Precision" = precision,
                      "Accuracy" = accuracy, "F1" = f1, "Runtime" = as.double(sim.runtime))

  finaledgerdf <- rbind(finaledgerdf, simdf)
}
write.csv(finaledgerdf, "Comparative Analysis/EE_results/edger.csv")

