#! /usr/bin/env Rscript

# Arguments given after the script name; the first one is the input file path.
args <- commandArgs(trailingOnly = TRUE)

# Significance threshold used by getSigP() when Bonferroni correction is off.
DEFAULT_SIGP <- 1
# Passed to main(): TRUE selects the Bonferroni-corrected threshold.
DO_BONFERRONI <- TRUE
# Read by main(): TRUE adds Fisher exact test p-values to the output.
DO_FISHER <- FALSE

# An input file is mandatory; abort with a message when none was given.
if (length(args) < 1) {
    stop("You really need to supply an input file, friend.")
}
fname <- args[[1]]

# Run a chi-squared test on a contingency table, without Yates' continuity
# correction, and return only its p-value.
#
# Args:
#   Matriz: contingency table (matrix) handed straight to chisq.test().
# Returns:
#   The p-value reported by chisq.test().
chiSqrTest <- function(Matriz) {
    chisq.test(Matriz, correct = FALSE)[["p.value"]]
}

# Run Fisher's exact test on a contingency table and return only its p-value.
#
# Args:
#   Matriz: contingency table (matrix) handed straight to fisher.test().
# Returns:
#   The p-value reported by fisher.test().
fishExaTest <- function(Matriz) {
    fisher.test(Matriz)[["p.value"]]
}

# Bonferroni-corrected significance threshold: a family-wise level of 0.05
# divided by the number of k-mers tested.
#
# Args:
#   nk: number of k-mers (tests).
# Returns:
#   0.05 / nk.
getSigPBonferroni <- function(nk) {
    familyWiseAlpha <- 0.05
    familyWiseAlpha / nk
}

# Keep only the rows whose Xsq statistic reaches the chi-squared critical value
# (1 degree of freedom, upper tail) that corresponds to sigP.
#
# Args:
#   counts: data frame with a numeric Xsq column.
#   sigP:   significance threshold (upper-tail probability).
# Returns:
#   The subset of counts with Xsq >= the critical value.
# Side effects:
#   Prints the largest Xsq value and the critical value to stdout.
filterByXsq <- function(counts, sigP) {
    cat('Max Xsq: ', max(counts$Xsq), '\n')
    criticalValue <- qchisq(sigP, df = 1, lower.tail = FALSE)
    cat('Initial Xsq filter lower bound:', criticalValue, '\n')
    passes <- counts$Xsq >= criticalValue
    counts[passes, ]
}
# Decide which significance threshold to use and report it on stdout.
#
# Args:
#   counts:        data frame with a phen column.
#   DO_BONFERRONI: TRUE  -> Bonferroni-corrected threshold from getSigPBonferroni();
#                  FALSE -> the file-level DEFAULT_SIGP.
# Returns:
#   The chosen significance threshold.
getSigP <- function(counts, DO_BONFERRONI) {
    # Every phenotype carries the same number of k-mers, so counting the rows
    # that share the first row's phenotype gives the number of k-mers.
    firstPhen <- counts$phen[1]
    sameAsFirst <- counts$phen == firstPhen
    nk <- nrow(counts[sameAsFirst, ])
    if (!DO_BONFERRONI) {
        sigP <- DEFAULT_SIGP
        cat('Significant p-value (no Bonferroni):', sigP, '\n')
    } else {
        sigP <- getSigPBonferroni(nk)
        cat('Significant p-value (with Bonferroni correction):', sigP, '\n')
    }
    sigP
}

# Build the 2x2 contingency table for one row of counts, laid out as:
#
#            present   absent
#   case
#   control
#
# Args:
#   r: one row of the counts data frame, providing casePresent, caseAbsent,
#      controlPresent and controlAbsent.
# Returns:
#   A 2x2 matrix with rows 'case'/'control' and columns 'present'/'absent'.
getRowMatriz <- function(r) {
    cells <- c(r$casePresent, r$caseAbsent,
               r$controlPresent, r$controlAbsent)
    # byrow = TRUE: the first two cells form the case row, the last two the control row.
    Matriz <- matrix(cells, nrow = 2, byrow = TRUE)
    dimnames(Matriz) <- list(c('case', 'control'), c('present', 'absent'))
    Matriz
}

# Filter the counts table down to the significant rows and write them to disk.
#
# Steps: pick the significance threshold, pre-filter on the Xsq statistic,
# attach chi-squared p-values (plus Fisher exact p-values when the file-level
# flag DO_FISHER is TRUE), filter on P_chi, sort, and write the result.
#
# Args:
#   counts:        data frame with at least the columns Xsq and phen; when
#                  DO_FISHER is TRUE also casePresent, caseAbsent,
#                  controlPresent and controlAbsent.
#   DO_BONFERRONI: forwarded to getSigP() to choose the threshold.
# Returns:
#   NULL, invisibly.
# Side effects:
#   Prints progress to stdout. When rows survive both filters, writes them to
#   'important<N>.txt' in the working directory, where N is the number of
#   characters in column 2 of the top row (presumably the k-mer string, i.e.
#   the k-mer length -- TODO confirm against the input format). When no rows
#   survive either filter, returns without writing a file.
main <- function(counts, DO_BONFERRONI) {
    # Must calculate sigP before we filter because after we remove k-mers we cannot tell how many we started with.
    sigP <- getSigP(counts, DO_BONFERRONI)
    cat('Before filter: ', nrow(counts), '\n')
    # Initial filter by Xsq statistic
    counts <- filterByXsq(counts, sigP)
    cat('After filter: ', nrow(counts), '\n')
    if (nrow(counts) == 0) {
        # Nothing significant: leave without writing output (and without
        # terminating the whole R session).
        return(invisible(NULL))
    }

    # Chi-squared p-values (1 degree of freedom, upper tail), computed for all
    # rows at once straight from the precomputed Xsq statistic.
    counts$P_chi <- pchisq(counts$Xsq, 1, lower.tail = FALSE)
    counts$log_P_chi <- 0 - log10(counts$P_chi)

    if (DO_FISHER) {
        # Fisher's exact test needs the 2x2 table of each row, so go row by row.
        counts$P_fish <- vapply(
            seq_len(nrow(counts)),
            function(i) fishExaTest(getRowMatriz(counts[i, ])),
            numeric(1)
        )
        counts$log_P_fish <- 0 - log10(counts$P_fish)
    }

    # Final filter by p-value
    counts <- counts[counts$P_chi <= sigP, ]
    if (nrow(counts) == 0) {
        # Without a first row the output file name cannot be derived; skip the
        # write, as is done when the Xsq filter leaves nothing.
        return(invisible(NULL))
    }

    # Sort by phenotype, strongest association (largest -log10 p) first.
    counts <- counts[order(counts$phen, -counts$log_P_chi), ]
    outFile <- paste0('important', nchar(as.character(counts[1, 2])), '.txt')
    write.table(counts, file = outFile, row.names = FALSE, quote = FALSE, sep = '  ')
    invisible(NULL)
}

# Load the input table; its first line supplies the column names.
counts <- read.table(fname, header = TRUE)
# Echo the first few Xsq values.
print(head(counts$Xsq))
# Only run the analysis when the table has at least one data row.
if (nrow(counts) >= 1) {
    main(counts, DO_BONFERRONI)
}

