#!/usr/bin/env ruby

#require 'bsearch'

# Record for one exon assigned to a gene: the generated exon name, the name of
# the gene it was assigned to, and the coordinate object describing its position.
Exon = Struct.new(:name, :gene, :coord)

# A gene identified by name, paired with the coordinate object locating it.
class Gene
	attr_accessor :name, :coord

	# n - the gene name
	# c - the gene's coordinate object (must respond to contains? for
	#     Gene#contains? to work)
	def initialize(n, c)
		@name, @coord = n, c
	end

	# Returns the inspect output of the name and of the coordinate, separated by "|".
	def inspect
		"#{@name.inspect}|#{@coord.inspect}"
	end

	# Delegates to the coordinate: does this gene's location contain c?
	def contains?(c)
		@coord.contains?(c)
	end
end

# A stranded interval on a chromosome.
#
# strand is stored as a boolean (true when constructed with '+', false for
# anything else) and interval as an inclusive Range of integers.
class Coord
	attr_accessor :chromosome, :strand, :interval

	# chr    - chromosome name
	# st     - strand marker; only '+' yields a true strand
	# startp - interval start, converted with to_i
	# endp   - interval end, converted with to_i
	def initialize(chr, st, startp, endp)
		@chromosome = chr
		@strand = st == '+'
		@interval = Range.new(startp.to_i, endp.to_i)
	end

	# True when coord is on the same strand and chromosome and both of its
	# interval end points fall inside this interval.
	def contains?(coord)
		return false unless coord.strand == strand && coord.chromosome == chromosome

		[coord.interval.first, coord.interval.last].all? { |pos| @interval.include?(pos) }
	end

	# Grows the interval to also cover startp..endp, but only if the two ranges
	# overlap. Returns true when they overlap (even if nothing had to grow) and
	# false, leaving the interval untouched, when they are disjoint.
	def update(startp, endp)
		return false if startp > @interval.last || endp < @interval.first

		@interval = (startp..@interval.last) if startp < @interval.first
		@interval = (@interval.first..endp) if endp > @interval.last
		true
	end

	# Returns the inspect output of chromosome, strand and interval, separated by "|".
	def inspect
		"#{@chromosome.inspect}|#{@strand.inspect}|#{@interval.inspect}"
	end
end


# Positional arguments, in order: refseq file, exons file, output file.
# Any argument that was not supplied is left as nil.
refseq_filename, exons_filename, out_filename = ARGV.shift(3)

# genes[chromosome][gene name] => Array of Gene records sharing that name.
# Both levels auto-create their entry on first access.
genes = Hash.new do |by_chromosome, chromosome|
	by_chromosome[chromosome] = Hash.new { |by_name, gene_name| by_name[gene_name] = [] }
end
refseq_filename ||= File.join(File.dirname(__FILE__), '../input/refseq_genes/refSeq_hg18.txt')
$stderr.puts "Reading refseq genes from #{refseq_filename}"
# File.foreach always treats its argument as a file path, whereas IO.foreach
# would run a name starting with "|" as a command.
File.foreach(refseq_filename) do |line|
	# The first tab-separated column is dropped; the rest are read as
	# chromosome, strand, start, end and gene name.
	chr, st, startp, endp, name = line.chomp.split(/\t/)[1..-1]

	next unless chr =~ /^chr(\d|1\d|2(0|1|2)|X|Y)$/

	# Merge this record into the first same-named gene it overlaps; if it
	# overlaps none (including when the name is new), keep it as its own Gene.
	# NOTE(review): Coord#update does not compare strands, so overlapping
	# same-named records on opposite strands are merged too -- confirm intended.
	same_name = genes[chr][name]
	merged = same_name.any? { |g| g.coord.update(startp.to_i, endp.to_i) }
	same_name << Gene.new(name, Coord.new(chr, st, startp, endp)) unless merged
end

# sorted_genes[chromosome] => every Gene on that chromosome, ordered by
# interval start.
sorted_genes = {}
genes.each_pair do |chromosome, genes_by_name|
	all_genes = genes_by_name.values.flatten
	sorted_genes[chromosome] = all_genes.sort do |left, right|
		left.coord.interval.first <=> right.coord.interval.first
	end
end

# Report the number of genes per chromosome on stderr.
sorted_genes.each_pair do |chromosome, gene_list|
	$stderr.puts "#{chromosome}\t#{gene_list.size}"
end

# exons[gene name][exon name] => Exon; the per-gene hash is created on first
# access, and its size provides the ordinal for that gene's next exon.
exons = Hash.new { |by_gene, gene_name| by_gene[gene_name] = {} }

# Without both paths the File calls below would fail with an opaque TypeError.
if exons_filename.nil? || out_filename.nil?
	abort "Usage: #{File.basename($0)} REFSEQ_FILE EXONS_FILE OUT_FILE"
end

$stderr.puts "Reading exons from #{exons_filename}"
File.open(out_filename, 'w') do |f|
	# File.foreach always treats its argument as a file path, whereas
	# IO.foreach would run a name starting with "|" as a command.
	File.foreach(exons_filename).with_index do |line, line_no|
		# The first line is skipped (treated as a header).
		next if line_no.zero?

		chromosome, strand, startp, endp = line.chomp.split(/\t/)

		next unless chromosome =~ /^chr(\d|1\d|2(0|1|2)|X|Y)$/

		exon_coord = Coord.new(chromosome, strand, startp, endp)

		# A chromosome may pass the filter above yet have no genes loaded;
		# treat that as "no containing gene" rather than calling a method on nil.
		containing = sorted_genes.fetch(chromosome, []).select { |g| g.coord.contains? exon_coord }
		next if containing.empty?

		gene_names = containing.map { |g| g.name }

		# Compute every exon name before storing anything, so that each
		# ordinal reflects the exons recorded before this line.
		exon_names = gene_names.map { |gene_name| "#{gene_name}_#{exons[gene_name].size}" }

		# One output row per containing gene, and the exon is recorded under it.
		gene_names.zip(exon_names).each do |gene_name, exon_name|
			f.puts [chromosome, startp, endp, strand, gene_name, exon_name].join("\t")
			exons[gene_name][exon_name] = Exon.new(exon_name, gene_name, exon_coord)
		end
	end
end
