cat('Loading packages...\n')
suppressMessages( library(DESeq) )
suppressMessages( library(gdata) )
suppressMessages( library(ggplot2) )
suppressMessages( library(foreach) )
suppressMessages( library(doParallel) )

MAX.CORES <- 14
numSampReps <- 2
signifLevel <- 0.1

# read in all sample and control files
samples <- sapply(list.dirs('samples', recursive=FALSE),
                  function(x) strsplit(x, '/', fixed=TRUE)[[1]][2])

sampFiles <- list.files('samples', pattern='gene_counts',
                        full.names=TRUE, recursive=TRUE)

controlFiles <- list.files('controls', pattern='gene_counts', full.names=TRUE)
DEsign <- c(rep('control', length(controlFiles)), rep(samples, each=numSampReps))

rRNAGeneNames <- read.table('rRNA_gene_names.txt', stringsAsFactors=FALSE)[,1]

# prepare samples data structure
geneCountsRaw <- lapply(c(controlFiles, sampFiles),
                        function(sampF) read.table(sampF)[,2])
geneCounts <- do.call(cbind, geneCountsRaw)
row.names(geneCounts) <- read.table(sampFiles[1], stringsAsFactors=FALSE)[,1]
colnames(geneCounts) <- c(
    sapply(strsplit(controlFiles, "/"), function(x)
           strsplit(x[2], ".", fixed=TRUE)[[1]][1]),
    sapply(strsplit(sampFiles, "/"), function(x)
           strsplit(x[3], ".", fixed=TRUE)[[1]][1]))

# remove HTSeq accounting lines
htSeqIndices <- (dim(geneCounts)[1] - 4):dim(geneCounts)[1]
htCountLines <- geneCounts[htSeqIndices,]
# remove rRNA genes as this was a riboZero protocol
rRNAIndicies <- match(rRNAGeneNames, row.names(geneCounts))
rRNACounts <- geneCounts[rRNAIndicies,]

rmIndices <- c(htSeqIndices, rRNAIndicies)
geneCounts <- geneCounts[-rmIndices,]

# these are samples that are being removed from downstream analysis
# based on having few hits
rmSamps <- c('CG6841', 'Cnot4', 'eIF3-S4', 'RnpS1', 'RpS3',
             'Sqd', 'Sqd_hrp40', 'xl6', 'Smn')

rdDat <- data.frame(
    Read.Count=c(colSums(geneCounts), colSums(rRNACounts), colSums(htCountLines)),
    Source=factor(c(rep('Gene Regions', dim(geneCounts)[2]),
        rep('rRNA', dim(geneCounts)[2]),
        rep('Non-Gene and Ambiguous', dim(geneCounts)[2]))),
    Sample=factor(rep(colnames(geneCounts), 3), ordered=TRUE,
        levels=colnames(geneCounts)),
    Sample.Removed=rep(sapply(strsplit(colnames(geneCounts), "_"),
        function(x) ifelse(x[1] %in% rmSamps, 'Removed', 'Included')), 3))
pdf('plots/read_depth.pdf')
ggplot() + geom_bar(data=rdDat, stat='identity', width=0.5,
                    aes(x=Sample, y=Read.Count, colour=Sample.Removed)) +
    facet_grid(Source ~ .) + theme_bw() +
    theme(axis.text.x=element_text(size=4, angle=50, hjust=1))
foo <- dev.off()

# filter by RPKM values
geneLens <- read.table('gene_lengths.txt')[,2]
names(geneLens) <- read.table('gene_lengths.txt', stringsAsFactors=FALSE)[,1]
geneLens <- geneLens[-match(rRNAGeneNames, names(geneLens))]

subGeneCounts <- geneCounts[, ! DEsign %in% rmSamps]
sampTotalReads <- colSums(subGeneCounts)
RPKMs <- do.call(cbind, lapply(
    colnames(subGeneCounts), function(samp){
        scaleFactor <- 1000000000 / sampTotalReads[samp]
        return( subGeneCounts[,samp] * scaleFactor / geneLens )
    }))

# Filter all data so that only paper RBPs are used to estimate dispersions
samples <- samples[! samples %in% rmSamps]
geneCounts <- geneCounts[, ! DEsign %in% rmSamps]
DEsign <- DEsign[! DEsign %in% rmSamps]

cat('Analyzing ', length(samples), ' samples...\n')
registerDoParallel(cores=min(length(samples), MAX.CORES))

exprGenes <- row.names(geneCounts)[apply(RPKMs, 1, function(x) sum(x > 1) > 2)]
write.table(exprGenes, quote=FALSE, row.names=FALSE, col.names=FALSE,
            file='expr_genes.txt')

# require that at least two replicates show expression > RPKM 1 for analysis
geneCounts <- geneCounts[apply(RPKMs, 1, function(x) sum(x > 1) > 2),]
save(geneCounts, file='geneCounts.RData')

# first steps to run DESeq blindly without IDR
cds <- newCountDataSet(geneCounts, DEsign)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds, method='pooled', sharingMode='maximum')
normCounts <- counts(cds, normalized=TRUE)

# store run values needed for plotting in auxilliary scripts
emptyOrFailed <- sapply(strsplit(controlFiles, "/"),
                        function(path) grepl("empty_vector", path[2]))
extras <- list(samples=samples, emptyOrFailed=emptyOrFailed,
               DEsign=DEsign, normCounts=normCounts)
save(extras, file='sep_samp_extras.RData')

cat('Starting foreach multiple precesses...\n')
allRes <- foreach(samp=samples, .packages=c('idr', 'DESeq')) %dopar% {
  # convert design to estimate dispersion using all samples besides samp
  # the replicates of samp are treated as separate samples and will be compared
  # by IDR for this sample
  sampDesign <- DEsign
  sampDesign[DEsign == samp] <- c(paste(samp, '_1', sep=''),
               paste(samp, '_2', sep=''))

  # create new count data set separating one set of replicates
  sampCds <- newCountDataSet(geneCounts, sampDesign)
  sampCds <- estimateSizeFactors(sampCds)
  sampCds <- estimateDispersions(sampCds)

  # calculate significant results without biological replicates
  sampRes1 <- nbinomTest(sampCds, 'control', paste(samp, '_1', sep=''))
  sampRes2 <- nbinomTest(sampCds, 'control', paste(samp, '_2', sep=''))

  # store -log p-values and remove invalid gene loci
  idrDatRaw <- matrix(c(
      -log10(sampRes1[,'padj']), -log10(sampRes2[,'padj']),
      sampRes1$foldChange, sampRes2$foldChange,
      sampRes1$baseMean, sampRes1$baseMeanA, sampRes1$baseMeanB,
      sampRes2$baseMean, sampRes2$baseMeanA, sampRes2$baseMeanB),
                      ncol=10)

  row.names(idrDatRaw) <- sampRes1$id
  # filter out NA and 0 NL p-values and samples where
  # either replicate has a down fold-change
  idrDat <- idrDatRaw[apply(
      idrDatRaw[,1:4], 1, function(x)
      ! (any(is.na(x)) || any(x[1:2] == 0) || any(x[3:4] < 1))),]
  if (samp %in% row.names(idrDat)){
      sampIndex <- match(samp, row.names(idrDat))
      idrDat <- idrDat[-sampIndex,]
  } else if (samp == 'CG17838' && 'Syp' %in% row.names(idrDat)){
      sampIndex <- match('Syp', row.names(idrDat))
      idrDat <- idrDat[-sampIndex,]
  }

  sampResIDR <- NA
  sampSignif <- NA
  if (! is.null(dim(idrDat)) && dim(idrDat)[1] > 1){
    # estimate IDR values for each gene and store result
    foo <- tryCatch({
      idrEst <- est.IDR(idrDat[,1:2], 1, 1, 0.5, 0.1, 0.01)
      sampResIDR <- idrEst$idr
      names(sampResIDR) <- row.names(idrDat)
      if (any(idrEst$idr <= signifLevel)){
        sampSignif <- row.names(idrDat)[idrEst$idr <= signifLevel]
      }
      cat('Finished', samp, '\n')
    }, error=function(mes){
      cat('IDR Error in', samp, '\n')
      return(NA)
    })
  } else {
    cat('No non-zero DESeq -log p-values', samp, '\n')
  }

  # return value for foreach functionality
  list(resDE=idrDat, resDERaw=idrDatRaw, resIDR=sampResIDR,
       signifGenes=sampSignif, samp=samp)
}

# create lists to store output of DE analysis
resDE <- vector(mode="list", length=length(samples))
names(resDE) <- samples
resDERaw <- vector(mode="list", length=length(samples))
names(resDERaw) <- samples
resIDR <- vector(mode="list", length=length(samples))
names(resIDR) <- samples
signifGenes <- vector(mode="list", length=length(samples))
names(signifGenes) <- samples

for (res in allRes){ 
  resDE[[res$samp]] <- res$resDE
  resDERaw[[res$samp]] <- res$resDERaw
  resIDR[[res$samp]] <- res$resIDR
  signifGenes[[res$samp]] <- res$signifGenes
}

save(resDE, file="sep_samp_DE_Results.RData")
save(resDERaw, file="sep_samp_DE_Results_Raw.RData")
save(resIDR, file="sep_samp_IDR_DE_Results.RData")
save(signifGenes, file="sep_samp_IDR_signif_genes.RData")

print(matrix(c(samples, unlist(lapply(resIDR, length)),
               unlist(lapply(signifGenes, length))), ncol=3,
             dimnames=list(c(), c('RBP', 'valid genes', 'signif genes'))))

cat('Running IDR on all samples together...\n')
# do IDR on all samples together (not including rmSamps)
allResDE <- matrix(ncol=4)
for (samp in names(resDE)){
    if (! is.null(dim(resDE[[samp]])) && dim(resDE[[samp]])[1] > 1){
        allResDE <- rbind(allResDE, resDE[[samp]][,1:4])
        row.names(allResDE)[(
            dim(allResDE)[1] - dim(resDE[[samp]])[1] + 1) : dim(allResDE)[1]] <-
                sapply(row.names(resDE[[samp]]),
                       function(geneName) paste(samp, geneName, sep='__'))
  }
}
# remove first empty row
allResDE <- allResDE[-1,]
# do IDR on all valid DESeq output together
allIdrEst <- est.IDR(allResDE[,1:2], 1, 1, 0.5, 0.1, 0.01)
allIdr <- allIdrEst$idr
names(allIdr) <- row.names(allResDE)
save(allIdr, file="sep_samp_all_IDR_DE_Results.RData")
allGlobalIdr <- allIdrEst$IDR
names(allGlobalIdr) <- row.names(allResDE)

# combine allIdr and resDE data structures
DEnames <- matrix(unlist(strsplit(row.names(allResDE), '__')),
                  ncol=2, byrow=TRUE)
allIdrDE <- matrix(ncol=12)
for (samp in names(resDE)){
    if (! is.null(dim(resDE[[samp]])) && dim(resDE[[samp]])[1] > 1){ 
        allIdrDE <- rbind(
            allIdrDE,
            cbind(allIdr[startsWith(names(allIdr), samp)],
                  resDE[[samp]][DEnames[DEnames[,1] == samp,2],],
                  allGlobalIdr[startsWith(names(allGlobalIdr), samp)]))
    }
}
allIdrDE <- allIdrDE[-1,]
colnames(allIdrDE) <- c('IDR', 'nlpadj1', 'nlpadj2', 'FC1', 'FC2',
                        'expr1','expr1empty', 'expr1samp',
                        'expr2', 'expr2empty', 'expr2samp', 'globalIDR')
save(allIdrDE, file='sep_samp_all_IDR_plus_DE_Results.RData')

# create signif genes from this structure
signifGenesAll <- vector(mode="list", length=length(resDE))
names(signifGenesAll) <- names(resDE)
for (samp in names(resDE)){
  if (! is.null(dim(resDE[[samp]])) && dim(resDE[[samp]])[1] > 1){
    # get sample gene names for this sample which are significant at a 5% idr
    sampGenes <- strsplit(row.names(allResDE)[(
      startsWith(row.names(allResDE), samp) & allIdr <= signifLevel)], '__')
    # get all gene names out of the list of split strings
    if (length(sampGenes) > 0){
      signifGenesAll[[samp]] <- unlist(
        sampGenes)[seq(2, (length(sampGenes) * 2), 2)]
    }
  }
}
save(signifGenesAll, file='sep_samp_all_IDR_signif_genes.RData')

cat('Plotting IDR Results...\n')
# get points near cutoff from total IDR for plotting
eps <- 0.01
cutPoints <- allResDE[abs(allIdr - signifLevel) < eps, 1:2]
cutPoints <- as.data.frame(cutPoints[cutPoints[,1] > cutPoints[,2],])
colnames(cutPoints) <- c('rep1', 'rep2')
fit <-  loess(rep1 ~ rep2, cutPoints)
cutDatLower <- data.frame(x=fit$x[order(fit$x)], y=fit$y[order(fit$x)])
cutDatUpper <- data.frame(x=fit$y[order(fit$x)], y=fit$x[order(fit$x)])

plotIDR <- function(dat, idrVals, title){
  gdat <- data.frame(
      rep1=dat[,1], rep2=dat[,2], IDR=idrVals)
  pTmp <- (ggplot() + geom_point(
      aes(x=rep1, y=rep2, colour=IDR), data=gdat, alpha=0.2) +
           xlab('Replicate 1 -log10 DESeq P-Value') +
           ylab('Replicate 2 -log10 DESeq P-Value') +
           scale_colour_gradient2(midpoint=signifLevel, space='Lab',
                                  high='black', mid='tomato4', low='red') +
           labs(title=title) + theme_bw() +
           scale_y_continuous(limits = c(0, 100)) +
           scale_x_continuous(limits = c(0, 100)) +
           geom_line(data=cutDatLower, aes(x=x, y=y), colour='blue') +
           geom_line(data=cutDatUpper, aes(x=x, y=y), colour='blue'))
}

# plot all IDR points and then sample by sample IDR with cutoff points
pdf('plots/IDR_diagnostic_plots.pdf')
print(plotIDR(allResDE, allIdr, 'All Samples'))
for (samp in names(resDE)){
  if ((! is.null(dim(resDE[[samp]])) && dim(resDE[[samp]])[1] > 1 &&
       ! is.na(resIDR[[samp]])) && any(resIDR[[samp]] < signifLevel)){
    print(plotIDR(resDE[[samp]][,1:2],
                  resIDR[[samp]], samp))
  }
}
foo <- dev.off()
