#!/usr/bin/env ruby

require 'set'
require 'bsearch'
require 'optparse'
require 'peach'
require 'returning'

# Command-line configuration.
#   out_filename - set by -f / --filename; stays nil when the switch is absent.
#   @window_size - set by -w / --window-size (Integer); defaults to 300.
# parse! consumes the recognised switches from ARGV.
out_filename = nil
@window_size = 300

option_parser = OptionParser.new do |parser|
	parser.banner = "Usage: #{File.basename($PROGRAM_NAME)} -f output_filename"

	parser.on('-w', '--window-size WINDOWSIZE', Integer,
	          'Determines the window size to consider overlaps') { |size| @window_size = size }
	parser.on('-f [FILENAME]', '--filename', String,
	          'Output filename') { |path| out_filename = path }
end
option_parser.parse!

# Exon record: +name+ (exon identifier), +gene+ (owning gene) and +coord+
# (the Coord object describing where the exon lies).
Exon = Struct.new(:name, :gene, :coord )

# A gene: a name paired with the coordinate object it spans.
class Gene
	attr_accessor :name, :coord

	# n - gene name
	# c - coordinate object; it must respond to contains? for Gene#contains?
	def initialize(n, c)
		@name, @coord = n, c
	end

	# Returns "<name.inspect>|<coord.inspect>".
	def inspect
		"#{@name.inspect}|#{@coord.inspect}"
	end

	# Delegates the containment test to the stored coordinate.
	def contains?(c)
		@coord.contains?(c)
	end
end

# A location on a chromosome: chromosome name, strand flag (true for '+',
# false for anything else) and an inclusive start..end range.
class Coord
	attr_accessor :chromosome, :strand, :interval

	# chr    - chromosome name
	# st     - strand symbol; only '+' maps to true
	# startp - interval start (converted with to_i)
	# endp   - interval end (converted with to_i)
	def initialize(chr, st, startp, endp)
		@chromosome = chr
		@strand = st == '+'
		@interval = Range.new(startp.to_i, endp.to_i)
	end

	# True when +coord+ is on the same strand and chromosome and both of its
	# interval end points fall inside this interval.
	def contains?(coord)
		return false unless coord.strand == strand && coord.chromosome == chromosome

		[coord.interval.first, coord.interval.last].all? { |position| @interval.include?(position) }
	end

	# Widens the interval so that it also covers startp..endp; it never shrinks.
	def update(startp, endp)
		@interval = Range.new(startp, @interval.last) if startp < @interval.first
		@interval = Range.new(@interval.first, endp) if endp > @interval.last
	end

	# Returns [chromosome, start, end, '+' or '-'].
	def format
		strand_symbol = strand ? '+' : '-'
		[chromosome, interval.first, interval.last, strand_symbol]
	end
end

# Scans a tab-separated interaction file and records, for every line that
# passes the score filter, each exon whose interval (extended by
# @window_size on both sides) covers the interaction's start..stop.
#
# file1        - path of the file to read; its first line is skipped as a header.
# sf           - splicing-factor label. An Integer is treated as a column
#                index and the label is taken from that column of each line;
#                any other value is used as the label itself.
# interactions - collection responding to <<. One
#                [label, exon, start, stop, symbol] array is appended per
#                matching exon.
# columns      - column indices handed to Array#values_at, in the order
#                chromosome, start, stop, strand, symbol.
# score_args   - optional score column index, optionally followed by a cutoff.
#
# Filtering: with a block, the block receives the Integer score and the line
# is kept only when the block returns a truthy value (the cutoff is not
# consulted). Without a block, a line is skipped when a score column was
# given and its Integer score is greater than the cutoff.
# NOTE(review): scores are truncated with to_i before comparison, so a
# fractional cutoff is compared against whole numbers - confirm to_f is not
# what was intended.
#
# Reads @exons (chromosome => list of exons) and @window_size from the
# calling scope. Returns nil.
def read_interaction_file(file1, sf, interactions, columns, *score_args)
	score_column, cutoff = score_args

	# File.foreach, unlike IO.foreach, never interprets a leading "|" in the
	# path as a command to run.
	File.foreach(file1).each_with_index do |line, line_number|
		next if line_number.zero? # header line

		records = line.chomp.split(/\t/)
		chr, start, stop, _strand, symbol = records.values_at(*columns)
		start = start.to_i
		stop = stop.to_i

		if block_given?
			next unless yield(records[score_column].to_i)
		elsif !score_column.nil?
			next if records[score_column].to_i > cutoff
		end

		# Integer instead of Fixnum: the Fixnum constant no longer exists in
		# Ruby 3.2+, and Integer matches the same values on older versions.
		factor = sf.is_a?(Integer) ? records[sf] : sf

		# fetch keeps the lookup read-only: unknown chromosomes are not added to
		# @exons, which may be shared between parallel workers.
		@exons.fetch(chr, []).each do |exon|
			span = exon.coord.interval
			next unless span.first - @window_size <= start && span.last + @window_size >= stop

			interactions << [factor, exon, start, stop, symbol]
		end
	end

	nil
end

# Exon annotation file: the first remaining command-line argument, or the
# bundled hg18 list next to this script when none is given.
exons_filename = ARGV.shift || File.join(File.dirname(__FILE__), '../input/exons_hg18_names.txt')

# chromosome => exons on that chromosome. Looking up a chromosome that has
# not been seen yet stores and returns an empty list.
@exons = Hash.new { |by_chromosome, chromosome| by_chromosome[chromosome] = [] }

# Each line: chromosome, start, stop, strand, gene, exon name (tab-separated).
IO.foreach(exons_filename) do |row|
	chr, first_base, last_base, strand, gene, exon_name = row.chomp.split(/\t/)
	location = Coord.new(chr, strand, first_base, last_base)
	@exons[chr] << Exon.new(exon_name, gene, location)
end


# Placeholder result set; it fills the third slot of every argument list
# built below.
interactions = Set.new

####################################################################### 
file1 = File.join(File.dirname(__FILE__), '../input/SFRS1clip.txt')
#read_interaction_file(file1, 'SFRS1', interactions, [0,1,2,3,-2])

####################################################################### 
file2 = File.join(File.dirname(__FILE__), '../input/PTBP1clip.txt' )
#read_interaction_file(file2, 'PTBP1', interactions, [0,1,2,3,-2], 5, 18)

####################################################################### 
file3 = File.join(File.dirname(__FILE__), '../input/RBM9clip.txt')
#read_interaction_file(file3, 'RBM9', interactions, [0,1,2,3,-3], -2, 275.8)

####################################################################### 
file4 = File.join(File.dirname(__FILE__), '../input/doRiNA240512.txt')
#read_interaction_file(file4, 0, interactions, [3,4,5,6,1], -2, 95) {|score| score <= 5}

####################################################################### 
# Score cutoff per HnRNP cluster file, keyed by "../input/<basename>".
thresholds = {
	"../input/HnRNPA1_comb_trim_ingenes_clusters_hg18150.bed" => 25,
	"../input/HnRNPA2B1_comb_trim_ingenes_clusters_hg18150.bed" => 13,
	"../input/HnRNPF_comb_trim_ingenes_clusters_hg18150.bed" => 13,
	"../input/HnRNPH_Burge_comb_trim_ingenes_clusters_hg18150.bed" => 47,
	"../input/HnRNPM_comb_trim_ingenes_clusters_hg18150.bed" => 45,
	"../input/HnRNPU_comb_trim_ingenes_clusters_hg18150.bed" => 14    
}

# One argument list per input file, in read_interaction_file order:
# [path, splicing factor, result set, columns, score column, cutoff].
params = [
	[file1, 'SFRS1', interactions, [0,1,2,3,-2], nil, nil],
	[file2, 'PTBP1', interactions, [0,1,2,3,-2], 5, 18],
	[file3, 'RBM9', interactions, [0,1,2,3,-3], -2, 275.8],
]

# Look for the HnRNP cluster files next to the script, like file1..file4
# above, instead of relative to the current working directory; otherwise
# running the script from another directory silently finds no HnRNP files.
input_dir = File.join(File.dirname(__FILE__), '../input')
Dir[File.join(input_dir, 'HnRNP*')].each do |file5|
	# thresholds is keyed by "../input/<basename>", so rebuild that key from
	# the basename rather than relying on the exact path the glob returned.
	threshold_key = File.join('../input', File.basename(file5))
	# NOTE(review): a file without a threshold is still queued with a nil
	# cutoff, which makes the score comparison in read_interaction_file raise.
	$stderr.puts "!!!!! PROBLEM: filenames do not match" unless thresholds.key?(threshold_key)
	# Splicing factor = file name up to the first underscore.
	# We ignore the symbol here: column 0 is passed in the symbol position.
	factor = File.basename(file5).split('_').first
	params << [file5, factor, interactions, [0,1,2,5,0], 4, thresholds[threshold_key]]
end

$stderr.puts "starting with files"
# Passed to pmap below - presumably the number of parallel workers (confirm
# against the peach gem's documentation).
Cores = 20

# Each argument list gets its own fresh Set in the result slot, so the
# concurrent read_interaction_file calls never append to a shared collection.
per_file_results = params.pmap(Cores) do |call_args|
	$stderr.puts "start with #{call_args[0]}"
	call_args[2] = Set.new
	read_interaction_file(*call_args)
	$stderr.puts "done with #{call_args[0]}"
	call_args[2]
end

# Fold every per-file set into the first one.
interactions = per_file_results.reduce(:merge).to_set

# The doRiNA file is read afterwards, directly into the merged set; the block
# keeps only lines whose score is at most 5.
read_interaction_file(file4, 0, interactions, [3,4,5,6,1], -2, 95) { |score| score <= 5 }

####################################################################### 

# Writes the results. For every interaction:
#   stdout - exon name, exon coordinates, splicing factor, interaction
#            start/stop and a position annotation, tab-separated;
#   file   - two tab-separated edges, "factor -> exon" and "exon -> gene",
#            each followed by the constants 0.6 and 1.
# out_filename is the value assigned by the -f option handler.
$stderr.write("Writing output to #{out_filename}\n")
File.open(out_filename, 'w') do |f|
	interactions.each do |i|
		splicing_factor, exon, interaction_start, interaction_stop = i.values_at(0, 1, -3, -2)
		exon_span = exon.coord.interval
		plus_strand = exon.coord.strand == true

		# Where the interaction lies relative to the exon. Lower-case labels are
		# used for the plus strand, capitalised (and mirrored) ones otherwise.
		annotation =
			if exon_span.first > interaction_stop
				plus_strand ? "before" : "After"
			elsif exon_span.first > interaction_start
				plus_strand ? "start" : "End"
			elsif exon_span.last < interaction_start
				plus_strand ? "after" : "Before"
			elsif exon_span.last < interaction_stop
				plus_strand ? "end" : "Start"
			else
				"contained"
			end

		puts [exon.name, exon.coord.format, splicing_factor, interaction_start, interaction_stop, annotation].flatten.join("\t")

		[[splicing_factor, exon.name], [exon.name, exon.gene]].each do |source, target|
			f.write([source, target, 0.6, 1].join("\t"))
			f.write("\n")
		end
	end
end
