#This file tests edgeR on imbalanced data
setwd("~/Documents/DEGage_stuff/DEGage_Testing/Comparative Analysis/imbalanced_resutls/")
library(edgeR)

# Benchmark edgeR's quasi-likelihood pipeline on every count matrix found in
# data/. For each file the analysis is repeated `n_reps` times; every
# repetition writes its significant genes to edger/<trial_id>.csv and the
# running table of performance metrics to edger/allsims.csv.
#
# Ground truth used for scoring (taken from the row positions this script has
# always used): the first `n_true_de` rows are counted as truly differentially
# expressed and rows (n_true_de + 1)..n_genes as equally expressed.
n_true_de <- 2000
n_genes <- 20000
fdr_cutoff <- 0.05
n_reps <- 5

fs <- list.files("data/")
allsims <- data.frame()
for (f in fs) {
  counts <- read.csv(file.path("data", f))
  rownames(counts) <- counts[, 1]
  counts <- counts[, -1]

  # Recover the two group sizes from the file name. For names shaped like
  # "5v10.csv" the "v" inside ".csv" is also a split point, so the second piece
  # is "10.cs"; dropping its last three characters leaves "10".
  # NOTE(review): this relies on that naming scheme -- confirm for new data.
  name_parts <- strsplit(f, "v")[[1]]
  name_parts[2] <- substr(name_parts[2], 1, nchar(name_parts[2]) - 3)
  group_sizes <- as.numeric(name_parts)
  n1 <- group_sizes[1]
  n2 <- group_sizes[2]

  # The sample columns must match the sizes encoded in the file name
  # (the original matrix()/dimnames construction enforced the same thing).
  stopifnot(ncol(counts) == n1 + n2)

  # Everything below depends only on the file, not on the repetition, so it
  # is built once per file.
  group <- factor(c(rep("Group1", n1), rep("Group2", n2)))
  design <- model.matrix(~0 + group)
  colnames(design) <- levels(group)
  onev.two <- makeContrasts(Group1 - Group2, levels = design)

  trueDEs <- rownames(counts)[1:n_true_de]
  trueEEs <- rownames(counts)[(n_true_de + 1):n_genes]

  for (j in seq_len(n_reps)) {
    trial_id <- paste(n1, n2, j, sep = "_")
    print(trial_id)

    dge <- DGEList(counts)

    sim.starttime <- Sys.time()

    dge <- calcNormFactors(dge)
    print('dispersion')
    # Argument names are case-sensitive: the option is `robust`, so the
    # previous spelling `Robust = TRUE` did not match it.
    dge <- estimateDisp(dge, design, robust = TRUE)
    print("ffit")
    fit <- glmQLFit(dge, design, robust = TRUE)

    print("ftest")
    res <- glmQLFTest(fit, contrast = onev.two)

    sim.endtime <- Sys.time()
    # difftime(later, earlier): the arguments were previously swapped, which
    # produced a negative duration.
    sim.runtime <- difftime(sim.endtime, sim.starttime)
    message("runtime: ", format(sim.runtime))

    topgenes <- as.data.frame(
      topTags(res, n = n_genes, adjust.method = "fdr", p.value = fdr_cutoff)
    )
    print("topgenes"  )
    topgenes <- topgenes[topgenes$FDR < fdr_cutoff, ]
    expDEs <- rownames(topgenes)

    # Confusion-matrix counts against the positional ground truth.
    tp <- sum(trueDEs %in% expDEs)
    fp <- sum(trueEEs %in% expDEs)
    tn <- sum(!(trueEEs %in% expDEs))
    fn <- sum(!(trueDEs %in% expDEs))

    sim.sensitivity <- tp / (tp + fn)
    sim.specificity <- tn / (tn + fp)
    precision <- tp / (tp + fp)  # NaN when no gene is called significant
    accuracy <- (tp + tn) / n_genes
    f1 <- (2 * tp) / (2 * tp + fp + fn)

    simdf <- data.frame("Trial" = trial_id,
                        DEGs = length(expDEs),
                        "Sensitivity" = sim.sensitivity,
                        "Specificity" = sim.specificity,
                        "Precision" = precision,
                        "Accuracy" = accuracy,
                        "F1" = f1,
                        g1 = n1,
                        g2 = n2)
    allsims <- rbind(allsims, simdf)
    print(simdf)

    fname <- file.path("edger", paste0(trial_id, ".csv"))
    write.csv(topgenes, fname)
    # Rewritten after every trial so partial results survive an interrupted run.
    write.csv(allsims, "edger/allsims.csv")
  }
}







