suppressMessages( library(gdata) )

# Thresholds used when calling an RBP-gene pair significantly bound: the IDR
# value must be below idrThresh and the fold changes above foldThresh.
idrThresh <- 0.1
foldThresh <- 1.5

# Genes with more than this many transcripts are treated as complex.
complexThresh <- 100

# Presumably provides allIdrDE, which is used later in this script but never
# assigned in it -- confirm against the contents of the RData file.
load('sep_samp_all_IDR_plus_DE_Results.RData')
# Per-gene complexity table: row names are gene names and the 'transcripts'
# column is the value compared against complexThresh.
complexity <- read.table('Supplementary_table_3_gene_complexity.tsv',
                         stringsAsFactors=FALSE, header=TRUE, row.names=1)
# RBPs treated as hnRNPs in the rank-sum tests further down.
hnRNPs <- c('CG17838', 'elav', 'msi', 'mub', 'ps', 'qkr54B', 'qkr58E-1')

# Use the declared threshold instead of repeating the literal 100, so that
# changing complexThresh actually changes the analysis.
complexGenes <- row.names(complexity)[
    complexity[,'transcripts'] > complexThresh]
exprGenes <- read.table('expr_genes.txt', stringsAsFactors=FALSE)[,1]
# Genes that are both present in the complexity table and expressed.
testableGenes <- intersect(row.names(complexity), exprGenes)

# NOTE(review): complexGenes is a subset of row.names(complexity), so the two
# intersect counts printed below are always equal. The first one is labelled
# as "found in 5.57" and probably should use a different gene list -- confirm.
cat(length(complexGenes), ' reported in (...),',
    length(intersect(complexGenes, testableGenes)),
    'of which are found in 5.57 and',
    length(intersect(exprGenes, complexGenes)),
    'of which are expressed in S2 cells\n\n')

# RBPs reported in the summary tables, in display order.
RBPs <- c("Cbp20", "CG6227", "Rm62", "snRNP-U1-70K", "U2af50", "Fmr1",
          "B52", "Rbp1", "SC35", "SF2", "Srp54", "tra2",
          "Syp", "elav", "msi", "mub", "ps", "qkr54B", "qkr58E-1",
          "Upf1")

# Rows of allIdrDE that pass the IDR threshold and whose smaller fold change
# (i.e. both FC1 and FC2) exceeds the fold-change threshold.
# which() drops rows where the comparison is NA (a logical NA index would
# otherwise insert all-NA rows), and drop=FALSE keeps the result
# two-dimensional with its row names even when only one row passes.
boundRowIdx <- which(
    allIdrDE[,'IDR'] < idrThresh &
    pmin(allIdrDE[,'FC1'], allIdrDE[,'FC2']) > foldThresh)
boundAllIdr <- allIdrDE[boundRowIdx, , drop=FALSE]

# complex gene section
# For every RBP in RBPs: collect its significantly bound, testable genes, test
# whether they are enriched for complex genes (one-sided hypergeometric test)
# and count, per complex gene, how many RBPs bind it (CGnumBound).

# Row names of boundAllIdr are split on '__'; the first field is used as the
# RBP ID and the second as the gene name.
boundRowParts <- strsplit(row.names(boundAllIdr), '__')
boundRowRbps <- vapply(boundRowParts, function(x) x[1], character(1))
boundRowGenes <- vapply(boundRowParts, function(x) x[2], character(1))

# Hits are restricted to testableGenes, so the hypergeometric "successes" must
# be counted inside that same universe rather than over all complex genes.
testableComplexGenes <- intersect(complexGenes, testableGenes)

CGnumBound <- rep(0, length(complexGenes))
names(CGnumBound) <- complexGenes
complexRows <- vector('list', length(RBPs))
for (i in seq_along(RBPs)) {
    RBP <- RBPs[i]
    # The unique binders section and the hnRNPs list refer to 'Syp' by the ID
    # 'CG17838', so accept either ID here instead of matching only the
    # display name.
    rbpIds <- if (RBP == 'Syp') c(RBP, 'CG17838') else RBP
    # Exact match on the RBP field; a bare prefix test would also pick up any
    # RBP whose ID merely starts with this one.
    RBPSignifGenes <- intersect(boundRowGenes[boundRowRbps %in% rbpIds],
                                testableGenes)
    rbpComplexGenes <- intersect(testableComplexGenes, RBPSignifGenes)
    numRBPComplexGenes <- length(rbpComplexGenes)
    CGnumBound[rbpComplexGenes] <- CGnumBound[rbpComplexGenes] + 1
    # P(overlap >= observed) when drawing length(RBPSignifGenes) genes from
    # the testable genes.
    complexRows[[i]] <- c(
        RBP, if (any(rbpIds %in% hnRNPs)) '*' else '',
        phyper(numRBPComplexGenes - 1, length(testableComplexGenes),
               length(testableGenes) - length(testableComplexGenes),
               length(RBPSignifGenes), lower.tail=FALSE),
        numRBPComplexGenes, length(RBPSignifGenes))
}
# One character row per RBP (numbers are stored as text, as before).
complexOverlap <- do.call(rbind, complexRows)
colnames(complexOverlap) <- c(
    'RBP', 'is_hnRNP', 'Hypergeom_p_value', 'complex_genes_overlap',
    'num_RBP_signif_genes')
# Most enriched first; ties broken by the number of significant genes.
complexOverlap <- complexOverlap[order(as.numeric(complexOverlap[,3]),
                                       as.numeric(complexOverlap[,5])), ,
                                 drop=FALSE]
# calculate hnRNP wilcox ranksum test
# Compares the row positions (ranks by p-value) of hnRNPs against the other
# RBPs; the is_hnRNP flag is used so that aliased IDs are classified the same
# way as in the table.
rankIdx <- seq_len(nrow(complexOverlap))
isHnRNP <- complexOverlap[,'is_hnRNP'] == '*'
cat('\nhnRNP wilcox ranksum test: ',
    wilcox.test(rankIdx[isHnRNP], rankIdx[!isHnRNP],
                alternative='less')$p.value, '\n\n')
write.table(complexOverlap, quote=FALSE, row.names=FALSE, sep='\t')
cat('\n\n')


# complex genes bound by section
# Distribution of the number of RBPs binding each complex gene.
CGBoundBy <- table(CGnumBound)
write.table(CGBoundBy, row.names=FALSE, quote=FALSE, sep='\t')
# Number of distinct genes (second '__' field of the row name) with at least
# one significant row in boundAllIdr.
numBoundGenes <- length(unique(vapply(
    strsplit(row.names(boundAllIdr), '__'),
    function(x) x[2], character(1))))
# NOTE(review): the numerator is not restricted to exprGenes or to the RBPs
# list, while CGnumBound is built only from the RBPs list -- confirm that the
# two fractions are meant to be compared on these populations.
allFracBound <- numBoundGenes / length(exprGenes)
# Fraction of complex genes bound by at least one RBP. The counts are taken
# over genes (length / > 0); sum(CGnumBound) would be the total number of
# binding events, and CGBoundBy['0'] is NA when every gene is bound.
numCGs <- length(CGnumBound)
numCGsBound <- sum(CGnumBound > 0)
cgBoundFrac <- numCGsBound / numCGs
cat('Fraction bound in total and by complex genes:',
    allFracBound, cgBoundFrac, '\n')
cat('CGs are ', 100 * ((cgBoundFrac / allFracBound) - 1),
    '% more likely to be bound by at least one RBP than expected at random (binomial p-value=',
    binom.test(numCGsBound, numCGs,
               p=allFracBound, alternative='greater')$p.value, ')\n')
cat('\n\n')


# unique binders section
# For each RBP, the percentage of its bound genes that no other RBP binds.
boundGenes <- row.names(boundAllIdr)
boundNameParts <- strsplit(boundGenes, '__')
boundRbpIds <- vapply(boundNameParts, function(x) x[1], character(1))
boundGeneIds <- vapply(boundNameParts, function(x) x[2], character(1))
# Number of significant rows (i.e. RBPs) per gene.
numBoundRBPs <- table(boundGeneIds)
# Bound gene names grouped by RBP ID.
rbpBinders <- split(boundGeneIds, boundRbpIds)
uniqueRows <- vector('list', length(RBPs))
for (i in seq_along(RBPs)) {
    rbp <- RBPs[i]
    # 'Syp' is looked up (and reported) under the ID 'CG17838'.
    if (rbp == 'Syp') {
        rbp <- 'CG17838'
    }
    rbpGenes <- rbpBinders[[rbp]]
    # Vectorised lookup; an RBP without any bound genes yields NaN instead of
    # aborting the script.
    numUnique <- sum(numBoundRBPs[rbpGenes] == 1)
    uniqueRows[[i]] <- c(rbp, numUnique * 100 / length(rbpGenes))
}
uniqueBinders <- do.call(rbind, uniqueRows)
colnames(uniqueBinders) <- c('RBP', 'Percent_Unique')

write.table(uniqueBinders, quote=FALSE, sep=',', row.names=FALSE,
            col.names=TRUE)

# calculate hnRNP wilcox ranksum test
# uniqueBinders is still in RBPs order (it is never sorted by Percent_Unique),
# so its row indices carry no information about uniqueness. The test is
# therefore run on the Percent_Unique values themselves.
# NOTE(review): alternative='greater' is read here as "hnRNPs have a higher
# percentage of unique targets" -- confirm the intended direction.
pctUnique <- as.numeric(uniqueBinders[,'Percent_Unique'])
isHnRNPRow <- uniqueBinders[,'RBP'] %in% hnRNPs
cat('hnRNP wilcox ranksum test unique hits: ',
    wilcox.test(pctUnique[isHnRNPRow], pctUnique[!isHnRNPRow],
                alternative='greater')$p.value, '\n')


# 3' UTR section
# For every RBP in allIdrDE, test whether its significant hits are enriched
# for the genes listed in eric_genes.conv.5.57.txt (one-sided hypergeometric
# test with the expressed genes as the universe).
cat('\n\n')
numGenes <- length(exprGenes)
# stringsAsFactors=FALSE matches the other read.table calls in this script and
# keeps the gene names as plain character strings on every R version.
rawUtrGenes <- read.table('eric_genes.conv.5.57.txt',
                          stringsAsFactors=FALSE)[,1]
tpUtrGenes <- intersect(rawUtrGenes, exprGenes)
cat('number of 3\' UTR genes expressed in S2: ', length(tpUtrGenes),
    'out of :', length(rawUtrGenes), '\n')

# The first '__' field of each row name is the RBP ID; one data frame per RBP.
allRowRbps <- vapply(strsplit(row.names(allIdrDE), '__'),
                     function(x) x[1], character(1))
allRbps <- unique(allRowRbps)
rbpMats <- split(as.data.frame(allIdrDE), allRowRbps)
tpRows <- vector('list', length(rbpMats))
for (i in seq_along(rbpMats)) {
    rbpMat <- rbpMats[[i]]
    rbpGenes <- vapply(strsplit(row.names(rbpMat), '__'),
                       function(x) x[2], character(1))
    # which() ignores rows whose IDR or fold change is NA.
    hitRows <- which(rbpMat[,'IDR'] < idrThresh &
                     pmin(rbpMat[,'FC1'], rbpMat[,'FC2']) > foldThresh)
    # The universe of the test is exprGenes (numGenes), so only hits inside
    # that universe may be counted as draws.
    rbpHits <- intersect(rbpGenes[hitRows], exprGenes)
    numUtrHits <- length(intersect(rbpHits, tpUtrGenes))
    tpRows[[i]] <- c(names(rbpMats)[i],
                     phyper(numUtrHits - 1,
                            length(tpUtrGenes),
                            numGenes - length(tpUtrGenes),
                            length(rbpHits), lower.tail=FALSE),
                     numUtrHits,
                     length(rbpHits))
}
tpOverlap <- do.call(rbind, tpRows)
colnames(tpOverlap) <- c('RBP', 'hyper_p_val', 'Num_intersect', 'numRBPBound')
# Most enriched RBP first.
tpOverlap <- tpOverlap[order(as.numeric(tpOverlap[,2])), , drop=FALSE]
write.table(tpOverlap, quote=FALSE, row.names=FALSE, sep='\t')

# Rank-sum test on the row positions (ranks by p-value) of hnRNPs versus the
# remaining RBPs. setdiff() is used for the complement because x[-idx]
# returns nothing at all when idx is empty.
tpRanks <- seq_len(nrow(tpOverlap))
hnRNPPoss <- which(tpOverlap[,'RBP'] %in% hnRNPs)
cat('hnRNPs enriched for tp_utrs (wilcox test): ',
    wilcox.test(hnRNPPoss, setdiff(tpRanks, hnRNPPoss),
                alternative='less')$p.value, '\n')


cat('\n\n')
# msi and elav specific section
# Returns the genes significantly bound by the named RBP (IDR below idrThresh
# and both fold changes above foldThresh) that are also in tpUtrGenes.
utrHitsFor <- function(rbpName) {
    rbpTable <- rbpMats[[rbpName]]
    geneIds <- sapply(strsplit(row.names(rbpTable), '__'),
                      function(x) x[2])
    passesIdr <- rbpTable[,'IDR'] < idrThresh
    passesFold <- pmin(rbpTable[,'FC1'], rbpTable[,'FC2']) > foldThresh
    intersect(geneIds[passesIdr & passesFold], tpUtrGenes)
}
msiHits <- utrHitsFor('msi')
elavHits <- utrHitsFor('elav')
cat('msi hits: ', length(msiHits), '\n')
cat('elav hits: ', length(elavHits), '\n')
# Genes hit by msi but not by elav.
msiOnlyHits <- setdiff(msiHits, elavHits)
cat('We find the msi significantly binds to ',
    length(msiOnlyHits),
    ' genes with 3’ UTR extensions which elav does not.\n\n')


cat('\n\n')
# brings all tables together
# complexOverlap lists this RBP as 'Syp'; rename it to the ID used by the
# other two tables so that the rows can be matched by name.
complexOverlap[which(complexOverlap[,'RBP'] == 'Syp'),'RBP'] <- 'CG17838'
# Align the three tables by RBP name. Sorting each table independently only
# lines the rows up when all three contain exactly the same set of RBPs.
tabRbps <- sort(uniqueBinders[,1])
uniqueIdx <- match(tabRbps, uniqueBinders[,1])
tpIdx <- match(tabRbps, tpOverlap[,1])
complexIdx <- match(tabRbps, complexOverlap[,1])
missingRbps <- tabRbps[is.na(tpIdx) | is.na(complexIdx)]
if (length(missingRbps) > 0) {
    stop('RBPs missing from the 3\' UTR or complex gene table: ',
         paste(missingRbps, collapse=', '), call.=FALSE)
}
allTabs <- cbind(uniqueBinders[uniqueIdx, 2],
                 tpOverlap[tpIdx, 2:3, drop=FALSE],
                 complexOverlap[complexIdx, 3:4, drop=FALSE])
colnames(allTabs) <- c('Percent Unique', "3' UTR Enrichment P-value",
                       "Number of 3' UTR genes",
                       'Complex Gene Enrichment P-Value', 'Complex Genes Bound')
row.names(allTabs) <- tabRbps
write.table(allTabs, quote=FALSE, row.names=TRUE, sep=',')

# Everything below is optional; the script deliberately halts here.
stop('Remove stop to calc up versus down calculations.')

# this is checking the fraction of hits that are up regulated in each sample
# Per RBP, prints: RBP ID, up/down ratio, number up, number down.
# The shared idrThresh / foldThresh values are used so these counts follow the
# same cut-offs as the rest of the script.
for (rbpMat in split(as.data.frame(allIdrDE),
                     sapply(strsplit(row.names(allIdrDE), '__'),
                            function(x) x[1]))) {
    isSignif <- rbpMat[,'IDR'] < idrThresh
    # Up: both fold changes above foldThresh (the smaller one exceeds it).
    numUp <- sum(isSignif &
                 pmin(rbpMat[,'FC1'], rbpMat[,'FC2']) > foldThresh)
    # Down: both fold changes below 1 / foldThresh (the larger one is below
    # it). pmin() here would count rows where only one of the two is down,
    # which is not symmetric with the "up" definition.
    numDown <- sum(isSignif &
                   pmax(rbpMat[,'FC1'], rbpMat[,'FC2']) < 1 / foldThresh)
    cat(strsplit(row.names(rbpMat)[1], '__')[[1]][1], '\t',
        numUp / numDown,
        numUp, '\t',
        numDown,
        '\n')
}
