load('sep_samp_all_IDR_plus_DE_Results.RData')
suppressMessages( library(gdata) )
suppressMessages( library(poibin) )

nTotalGenes <- 16894
signifLevel <- 0.1
foldCutoff <- 1.5
hsNumber <- 10

RBPs <- c("Cbp20", "CG6227", "Rm62", "snRNP-U1-70K", "U2af50", "Fmr1",
          "B52", "Rbp1", "SC35", "SF2", "Srp54", "tra2",
          "CG17838", "elav", "msi", "mub", "ps", "qkr54B", "qkr58E-1",
          "Upf1")

boundIdrDE <- allIdrDE[allIdrDE[,'IDR'] < signifLevel &
                       pmin(allIdrDE[,'FC1'], allIdrDE[,'FC2']) > foldCutoff,]
numBoundRBPs <- table(sapply(strsplit(row.names(boundIdrDE), '__'),
                             function(x) x[2]))
hsGenes <- names(numBoundRBPs)[numBoundRBPs >= hsNumber]
numBoundGenes <- sum(numBoundRBPs > 0)
exprGenes <- read.table('expr_genes.txt', stringsAsFactors=FALSE)[,1]
numExprGenes <- length(exprGenes)

cat('\nwe recover an average of ',
    floor(mean(table(sapply(strsplit(row.names(boundIdrDE), '__'),
                            function(x) x[1])))),
    ' bound targets per RBP\n')
cat('target ', round(100 * (numBoundGenes / numExprGenes), 2),
    '% of genes expressed in S2 cells and ',
    round(100 * (numBoundGenes / nTotalGenes), 2), '% of all genes in fly\n')
cat('The RNAs of', length(hsGenes), 'genes are bound by half or',
    ' more of the RBPs from this study\n\n')
write.table(hsGenes, quote=FALSE, row.names=FALSE, col.names=FALSE,
            file='hotspot_genes.txt')
cat('RBPs that are hotspots: ', intersect(RBPs, hsGenes), '\n')

PVals <- list()
for (samp in RBPs){
    numSampHits <- sum(startsWith(row.names(boundIdrDE), samp))
    numSampHS <- sum(sapply(strsplit(row.names(boundIdrDE)[startsWith(
        row.names(boundIdrDE), samp)], '__'),
                            function(x) ifelse(x[2] %in% hsGenes, 1, 0)))
    sampPVal <- phyper(numSampHS - 1, numSampHits,
                       numBoundGenes - numSampHits, length(hsGenes),
                       lower.tail=FALSE)
    PVals[[samp]] <- sampPVal
}
cat('Overlap of sample hits with hsRNAs\n\n')
write.table(sort(unlist(PVals)), quote=FALSE, sep='\t')

indicies <- 1:length(RBPs)
hnRNPs <- c('CG17838', 'elav', 'msi', 'mub', 'ps', 'qkr54B', 'qkr58E-1')
cat("hnRNP and quaking related RBPs contribute much less significantly  to ",
    "HOT RNA GO term enrichments (rank rum p-value ",
    wilcox.test(indicies[names(sort(unlist(PVals))) %in% hnRNPs],
                indicies[! names(sort(unlist(PVals))) %in% hnRNPs],
                alternative='greater')$p.value, '\n')
cat("at least one hnRNP or quaking related protein targets ",
    round(length(
        unique(sapply(strsplit(row.names(boundIdrDE), '__'), function(x) x[2])[sapply(
            strsplit(row.names(boundIdrDE), '__'), function(x)
            x[1] %in% hnRNPs && x[2] %in% hsGenes)])) * 100 / length(hsGenes), 2),
    "% of HOT RNAs\n")

# poisson binomial bound genes
boundGenes <- row.names(boundIdrDE)
numGenes <- length(unique(sapply(strsplit(boundGenes, '__'), function(x) x[2])))
ps <- table(sapply(strsplit(boundGenes, '__'), function(x) x[1])) / numGenes

numRNAs <- table(numBoundRBPs)
cat('\nMost bound hotspot genes (', max(numBoundRBPs), '): ',
    names(numBoundRBPs)[numBoundRBPs == max(numBoundRBPs)], '\n',
    'hotspot genes (', max(numBoundRBPs) - 1, '): ',
    names(numBoundRBPs)[numBoundRBPs == max(numBoundRBPs) - 1],'\n\n')
genesBound <- rep(0, length(ps) + 1)
genesBound[as.integer(names(numRNAs)) + 1] <- numRNAs
genesBound[1] <- length(setdiff(
    unique(sapply(strsplit(row.names(allIdrDE), '__'), function(x) x[2])),
    unique(sapply(strsplit(row.names(boundIdrDE), '__'), function(x) x[2]))))

cat('Hotspot existence table: \n')
basePVals <- ppoibin((0:length(ps))-1, ps)
numBoundCumSum <- rev(cumsum(rev(genesBound)))

# simulate overlapped binding and calculate gaussian approximated p-values
poibinSamps <- lapply(1:2000, function(foo) rpoibin(numGenes, ps))
zVals <- sapply(0:max(numBoundRBPs), function(i){
    boundSamp <- sapply(poibinSamps, function(samps) sum(samps >= i))
    trueP <- ppoibin(i, ps)
    av <- mean(boundSamp)
    sd <- sd(boundSamp)
    # these calculated avg and sd numbers are wrong cus I don't
    # understand this distribution (so we simulate)
    #avCalc <- numGenes * trueP
    #sdCalc <- sqrt(numGenes * trueP * (1 - trueP))
    #cat((numBoundCumSum[(i+1)] - av) / sd, '\t',
    #    (numBoundCumSum[(i+1)] - avCalc) / sdCalc, '\n')
    return((numBoundCumSum[(i+1)] - av) / sd)
})
gausApprox <- pnorm(zVals, lower.tail=FALSE)
write.table(data.frame(
    'Number of RBPs Bound'=0:max(numBoundRBPs),
    'Number of RNAs Bound'=genesBound[1:(max(numBoundRBPs)+1)],
    'Prob Single RNA Bound by X RBPs or more'=(1-basePVals)[1:(max(numBoundRBPs)+1)],
    'Prob Max RNA Bound by X RBPs or more'=1-(basePVals ^ numGenes)[1:(max(numBoundRBPs)+1)],
    'Prob N RNAs Bound by X RBPs or more.Approx'=gausApprox),
            quote=FALSE, sep='\t', row.names=FALSE, col.names=TRUE)

cat('\nChance that a single RNA is bound by one or zero RIP experiments: ',
    ppoibin(1, ps),
    '\nChance that a single RNA is bound by zero RIP experiments: ',
    ppoibin(0, ps), '\n\n')

cat('an ', round(length(hsGenes) / 1-(ppoibin(11, ps)^numGenes)),
    '-fold enrichment over expectation ',
    '(Poisson-binomial p-value ', (1-(ppoibin(11, ps)^numGenes))^length(ps),
    ')\n\n')


# table of all pairwise overlaps
rbpBinders <- split(
    sapply(strsplit(boundGenes, '__'), function(x) x[2]),
    sapply(strsplit(boundGenes, '__'), function(x) x[1]))
allPairwiseOverlaps <- matrix(ncol=6)
for (i in 1:(length(RBPs) - 1)){
    for (j in (i+1):length(RBPs)){
        numOvlp <- length(intersect(
            rbpBinders[[RBPs[i]]], rbpBinders[[RBPs[j]]]))
        pVal <- phyper(numOvlp - 1, length(rbpBinders[[RBPs[i]]]),
                       numExprGenes - length(rbpBinders[[RBPs[i]]]),
                       length(rbpBinders[[RBPs[j]]]), lower.tail=FALSE)
        allPairwiseOverlaps <- rbind(
            allPairwiseOverlaps,
            c(RBPs[i], RBPs[j], pVal, numOvlp, length(rbpBinders[[RBPs[i]]]),
              length(rbpBinders[[RBPs[j]]])))
    }
}
allPairwiseOverlaps <- allPairwiseOverlaps[-1,]
colnames(allPairwiseOverlaps) <- c('RBP1', 'RBP2', 'Hypergeometric.P.Value',
                                    'Overlap', 'numRNAs1', 'numRNAs2')
allPairwiseOverlaps <- allPairwiseOverlaps[order(
    as.numeric(allPairwiseOverlaps[,'Hypergeometric.P.Value'])),]
write.table(allPairwiseOverlaps, quote=FALSE, sep=',', row.names=FALSE,
            file='allPairwiseOverlaps.csv')

uniqueBinders <- matrix(ncol=2)
for (rbp in RBPs){
    uniqueBinders <- rbind(
        uniqueBinders,
        c(rbp, sum(sapply(rbpBinders[[rbp]],
                          function(geneId) numBoundRBPs[geneId] == 1)) * 100 /
          length(rbpBinders[[rbp]])))
}
uniqueBinders <- uniqueBinders[-1,]
colnames(uniqueBinders) <- c('RBP', 'Percent_Unique')
uniqueBinders <- uniqueBinders[order(uniqueBinders[,'Percent_Unique']),]

write.table(uniqueBinders, quote=FALSE, sep=',', row.names=FALSE,
            col.names=TRUE)

# calculate hnRNP wilcox ranksum test
cat('hnRNP wilcox ranksum test unique hits: ',
    wilcox.test(indicies[uniqueBinders[,'RBP'] %in% hnRNPs],
                indicies[! uniqueBinders[,'RBP'] %in% hnRNPs],
                alternative='greater')$p.value, '\n')
