#!/usr/bin/env ruby

require 'set'
require 'hypergeometric'

# Positional command-line arguments, consumed from ARGV in this order:
#   1. background file (read below as tab-separated exon / event pairs)
#   2. SF interaction file
#   3. output file for the significant tissue / SF rows
# Fail fast with a usage message: a missing argument would otherwise stay nil
# and only surface later as a TypeError inside IO.foreach / File.open.
if ARGV.size < 3
	abort "Usage: #{File.basename($PROGRAM_NAME)} <background_events> <sf_interactions> <output>"
end
bg_filename, sf_filename, out_filename = ARGV.shift(3)

bg = Set.new
event_exon_map = Hash.new { |hash, key| hash[key] = Set.new }

# 0. Load the background, which is all possible events.
#    This is "event_mapping.txt": one tab-separated "exon<TAB>event" pair per
#    line. bg collects every exon; event_exon_map groups the exons by event.
$stderr.puts "Reading background from #{bg_filename}"
# File.foreach rather than IO.foreach: with IO as the receiver a name starting
# with "|" is run as a command, which is never wanted for a user-supplied path
# (File.foreach does not do this on Ruby 2.6+).
File.foreach(bg_filename) do |line|
	exon, event = line.chomp.split(/\t/)
	# A blank line or an empty first column would otherwise add nil / "" to bg
	# and inflate bg.size, which is passed on to the hypergeometric score.
	next if exon.nil? || exon.empty?
	bg << exon
	# Only file the exon under an event when the line actually names one, so no
	# nil key ends up in event_exon_map.
	event_exon_map[event] << exon unless event.nil?
end
$stderr.write( "BG: #{bg.size}\n" )

# 1. Load the SF - event list.
#    This is "dror_interactions.txt": tab-separated, with the exon in the first
#    column and the SF name in the sixth. Only exons that are part of the
#    background set are recorded.

$stderr.puts "Reading all SF-protein interaction based on CLIP data from #{sf_filename}"
sf_interactions = Hash.new { |hash, key| hash[key] = Set.new }
# File.foreach rather than IO.foreach: with IO as the receiver a name starting
# with "|" is run as a command, which is never wanted for a user-supplied path
# (File.foreach does not do this on Ruby 2.6+).
File.foreach(sf_filename) do |line|
	exon, sf = line.chomp.split(/\t/).values_at(0,5)
	# Rows with fewer than six columns carry no SF name; skipping them keeps a
	# nil key (and later a nameless result row) out of sf_interactions.
	next if sf.nil?
	sf_interactions[sf] << exon if bg.include? exon
end
$stderr.write( "SF interactions: #{sf_interactions.size}\n" )

# 2. Read the differentially spliced events and collect, per tissue (column 1),
#    the exons of every event (column 3) that is known to event_exon_map.
diff_sp_genes_filename = File.join(File.dirname(__FILE__), '../input/cancerDiffAS.txt')

tissue_exons = Hash.new { |hash, key| hash[key] = Set.new }
IO.foreach(diff_sp_genes_filename) do |row|
	columns = row.chomp.split(/\t/)
	tissue = columns[0]
	event_name = columns[2]
	# Test for the key explicitly: indexing event_exon_map with an unknown event
	# would insert an empty set through its default block.
	next unless event_exon_map.key?(event_name)
	tissue_exons[tissue].merge(event_exon_map[event_name])
end
$stderr.puts "Tissues events: #{tissue_exons.keys.inspect}"
$stderr.puts "Tissues events: #{tissue_exons.size}"

results = []

# 3. Score every tissue / SF pair with Math.hypergeometric_score, passing the
#    overlap size, the tissue exon count, the SF exon count and the background
#    size, and keep the pairs whose score is below 0.05.
background_size = bg.size
tissue_exons.each_pair do |tissue, tissue_set|
	sf_interactions.each_pair do |factor, factor_set|
		shared = tissue_set & factor_set
		p_value = Math.hypergeometric_score(shared.size, tissue_set.size, factor_set.size, background_size)
		next unless p_value < 0.05
		results << [tissue, factor, p_value]
	end
end
$stderr.puts "Results: #{results.size}"

# Write one tab-separated "tissue<TAB>SF<TAB>score" row per retained result.
$stderr.puts "Writing results to #{out_filename}"
File.open(out_filename, 'w') do |f|
	# Rows stay grouped by tissue; within a tissue the lowest score comes first
	# and the SF name breaks remaining ties, so the order is fully determined.
	# Sorting on the tissue alone left the order inside a tissue to Ruby's
	# unstable sort and never applied the score ranking. to_s keeps a nil SF
	# name from breaking the comparison.
	ordered = results.sort_by { |tissue, sf, score| [tissue, score, sf.to_s] }
	ordered.each do |record|
		f.puts record.join("\t")
	end
end
