#!/usr/bin/env ruby

require 'gene_info'
require 'returning'
require 'nokogiri'
require 'mean'
require 'set'

# Sums -ln(confidence) over every line of a network file.
#
# Fields are split on runs of whitespace (the same way read_network splits
# them), so columns separated by more than one space or tab are still read
# correctly; the confidence is the third field. Blank lines are skipped
# rather than being counted as confidence 0, which would make the total
# Infinity.
#
# filename    - path of the network file.
# anchor_name - not used by this method.
#
# Returns the sum: the Integer 0 when no line was counted, otherwise a Float.
def total_network_size(filename, anchor_name)
	total = 0
	# File.foreach never treats a name starting with "|" as a command to run,
	# which IO.foreach does.
	File.foreach(filename) do |line|
		fields = line.chomp.split(/\s+/)
		next if fields.empty?

		total += -Math.log(fields[2].to_f)
	end

	total
end

# Reads an undirected network from a whitespace-separated edge list.
#
# The first two fields of each line are the edge's endpoints. Any further
# field (such as a confidence) is ignored: every edge is stored with
# weight 1. Lines with fewer than two fields (including blank lines) are
# skipped so that no nil node ends up in the network.
#
# filename - path of the network file.
#
# Returns a Hash of node => { neighbour => 1 }, with each edge recorded in
# both directions. Looking up a missing node creates and stores an empty
# neighbour Hash for it.
def read_network(filename)
	network = Hash.new { |hash, key| hash[key] = {} }

	# File.foreach never treats a name starting with "|" as a command to run,
	# which IO.foreach does.
	File.foreach(filename) do |line|
		from, to = line.chomp.split(/\s+/).first(2)
		next if from.nil? || to.nil?

		network[from][to] = 1
		network[to][from] = 1
	end

	network
end

# Computes shortest-path distances from one node to every node it can reach,
# using Dijkstra's algorithm with a linear scan for the closest queued node.
#
# from    - the start node.
# network - Hash of node => { neighbour => edge weight }.
#
# Returns a Hash of node => distance containing the start node (distance 0)
# and every reachable node. Unreachable nodes have no entry; looking one up
# yields the Hash's default, Float::INFINITY.
def shortest_paths(from, network)
	distances = Hash.new(Float::INFINITY)
	distances[from] = 0
	queue = network.keys

	until queue.empty?
		closest = queue.min_by { |node| distances[node] }
		queue.delete(closest)

		# Once the closest queued node is infinitely far away, nothing left in
		# the queue can be reached, so there is no point expanding it.
		break if distances[closest] == Float::INFINITY

		network[closest].each do |neighbour, weight|
			candidate = distances[closest] + weight
			distances[neighbour] = candidate if candidate < distances[neighbour]
		end
	end

	distances
end

# Largest shortest-path distance from the anchor to any node that
# shortest_paths reports for the network stored in filename.
#
# filename    - path of the network file, loaded with read_network.
# anchor_name - node the distances are measured from.
def max_distance_from_anchor(filename, anchor_name)
	distances = shortest_paths(anchor_name, read_network(filename))
	distances.values.max
end

# Mean of the shortest-path distances from the anchor to every node that
# shortest_paths reports for the network stored in filename. The mean is
# taken with the `mean` method the distance list responds to.
#
# filename    - path of the network file, loaded with read_network.
# anchor_name - node the distances are measured from.
def mean_distance_from_anchor(filename, anchor_name)
	distances = shortest_paths(anchor_name, read_network(filename))
	distances.values.mean
end

# Counts the nodes of a network whose name, converted to an integer, is
# known to @gi. Other nodes (terminals, which would have invalid gene
# names) are not counted.
#
# filename    - path of the network file, loaded with read_network.
# anchor_name - not used by this method.
def num_nodes(filename, anchor_name)
	node_names = read_network(filename).keys
	known = node_names.select { |node| @gi.include?(node.to_i) }
	known.size
end

# Counts the edges of a network whose two endpoints are both known to @gi
# (node names are converted to integers for the lookup). Edges touching a
# terminal, which would have an invalid gene name, are not counted.
#
# filename    - path of the network file, loaded with read_network.
# anchor_name - not used by this method.
#
# Returns an Integer; 0 when no node survives the filter (summing from an
# explicit 0 avoids calling / on nil for an empty network).
def num_edges(filename, anchor_name)
	network = read_network(filename)
	network.delete_if { |node, _| !@gi.include?(node.to_i) } # remove terminals (these would have invalid gene names)

	degrees = network.values.map do |neighbours|
		neighbours.count { |neighbour, _| @gi.include?(neighbour.to_i) }
	end

	# every edge is stored once per endpoint, hence the halving
	degrees.reduce(0, :+) / 2
end

# Jaccard index between the gene nodes of a network and the gene nodes of
# its companion network.
#
# The companion file lives in the same directory; its name is the first and
# last dot-separated parts of the network's file name (for example
# "A.ctrl.liver" -> "A.liver"). Only the file name is split on dots, so a
# dot anywhere in the directory part of the path (such as "./out") does not
# corrupt the companion path.
#
# In both networks, nodes whose name converted to an integer is unknown to
# @gi are dropped before comparing.
#
# filename    - path of the network file, loaded with read_network.
# anchor_name - not used by this method.
#
# Returns a Float: shared nodes divided by all nodes. The result is NaN
# when neither network has any gene node (0.0 / 0).
def jaccard_nodes(filename, anchor_name)
	network = read_network(filename)
	network.delete_if { |node, _| !@gi.include?(node.to_i) } # remove terminals (these would have invalid gene names)

	directory, basename = File.split(filename)
	tissue_basename = basename.split('.').values_at(0, -1).join('.')
	# keep a bare file name bare instead of turning it into "./name"
	tissue_filename = filename.include?('/') ? File.join(directory, tissue_basename) : tissue_basename

	tissue = read_network(tissue_filename)
	tissue.delete_if { |node, _| !@gi.include?(node.to_i) } # remove terminals (these would have invalid gene names)

	network_nodes = Set.new(network.keys)
	tissue_nodes = Set.new(tissue.keys)

	(network_nodes & tissue_nodes).size.to_f / (network_nodes | tissue_nodes).size
end

# Collects the absolute expression values of the genes in a network.
#
# Each line of the file contributes its first two fields as integer gene
# ids. Fields are split on runs of whitespace (as read_network does), so
# repeated separators no longer make an endpoint read as 0. A line is
# skipped when it has fewer than two fields or when either endpoint
# converts to 0 (a non-numeric name). Ids known to @gi are replaced by
# their default_id.
#
# The expression column is the text after the last underscore of filename,
# lower-cased, followed by 'diff'. The anchor itself is left out.
#
# filename    - path of the network file; must contain an underscore,
#               otherwise the column lookup raises NoMethodError.
# anchor_name - id of the anchor as a String.
#
# Returns the absolute values joined with '|', in the order the genes first
# appear in the file; genes without a value in that column are omitted.
def network_expression(filename, anchor_name)
	anchor_name = (@gi.include?(anchor_name.to_i) ? @gi[anchor_name.to_i].default_id : anchor_name)

	genes = Set.new
	# File.foreach never treats a name starting with "|" as a command to run,
	# which IO.foreach does.
	File.foreach(filename) do |line|
		from, to = line.chomp.split(/\s+/).first(2).map(&:to_i)
		next if from.nil? || to.nil? || from == 0 || to == 0

		genes << (@gi.include?(from) ? @gi[from].default_id : from)
		genes << (@gi.include?(to) ? @gi[to].default_id : to)
	end

	tissue = filename.match(/_([^_]+)$/)[1].downcase + 'diff'

	scores = genes.map do |gene|
		next if gene == anchor_name

		# include? avoids creating an entry in @expression for unknown genes
		level = @expression[gene][tissue] if @expression.include?(gene)
		level.abs unless level.nil?
	end

	scores.compact.join('|')
end

################################
# Builds a direction-independent key for the edge between two nodes: the
# string forms of both endpoints, smaller one first, joined by "_".
def edge_name(from, to)
	[from.to_s, to.to_s].sort.join('_')
end

# Records one edge in the edges collection under its edge_name key.
#
# edges      - collection indexed by edge key; edges[key] must answer with
#              an object that accepts []= for the attributes below.
# from, to   - endpoints, stored as 'source' and 'target'.
# confidence - stored as 'confidence'.
# directed   - stored as 'directed'.
#
# Returns the edge key.
def add_edge(edges, from, to, confidence, directed)
	key = edge_name(from, to)

	attributes = edges[key]
	attributes['source'] = from
	attributes['target'] = to
	attributes['confidence'] = confidence
	attributes['directed'] = directed

	key
end

# Fills in the attributes of one node.
#
# nodes     - collection indexed by node; nodes[from] must answer with an
#             object that accepts []=.
# terminals - collection of terminal nodes (queried with include?).
# anchor    - the anchor node.
# gi        - not used by this method.
# from      - the node to describe; also stored as its "name".
# tissue    - column of @expression to copy into "expression".
#
# "type" is 'anchor' for the anchor, otherwise 'terminal' for a terminal,
# and is left unset for any other node. "expression" is only set when
# @expression has a non-nil value for the node in the given column.
def add_node(nodes, terminals, anchor, gi, from, tissue)
	node = nodes[from]

	role = nil
	role = 'terminal' if terminals.include?(from)
	role = 'anchor' if anchor == from # anchor wins over terminal
	node["type"] = role unless role.nil?

	node["name"] = from

	# include? keeps @expression from creating an entry for an unknown node
	level = @expression[from][tissue] if @expression.include?(from)
	node["expression"] = level unless level.nil?
end

# The first command-line argument names the results folder. Without it the
# paths built from folder_name would become ".terminals.txt" and "/*", so
# stop with a usage message instead.
folder_name = ARGV.shift
abort "usage: #{File.basename($0)} FOLDER" if folder_name.nil?

@gi = GeneInfo.for_species(GeneInfo::Human)

# Load the terminal names, one per line, from "<folder_name>.terminals.txt".
# A name known to @gi is stored as its default_id; any other name is stored
# as read.
terminals_filename = "#{folder_name}.terminals.txt"
terminals = Set.new
IO.foreach(terminals_filename) do |raw|
	symbol = raw.chomp
	terminals << (@gi.include?(symbol) ? @gi[symbol].default_id : symbol)
end

# Load the tab-separated expression table into @expression as
# gene => { lower-cased column title => Float }.
#
# The first line holds the column titles; its last column is dropped,
# matching the gene name that is popped off the end of every later row.
# Cells equal to 'NA' are left out. A gene known to @gi is keyed by its
# default_id.
expression_filename = File.join(File.dirname(__FILE__), '../input/expression.diff.txt')
@expression = Hash.new { |hash, key| hash[key] = {} }
titles = []
IO.foreach(expression_filename).with_index do |row, row_number|
	fields = row.chomp.split(/\t/)

	if row_number.zero?
		titles = fields[0...-1].map(&:downcase)
		next
	end

	gene = fields.pop
	gene = @gi[gene].default_id if @gi.include?(gene)

	measured = titles.zip(fields).reject { |_, value| value == 'NA' }
	@expression[gene] = Hash[measured.map { |title, value| [title, value.to_f] }]
end

# Score every network file in the results folder, grouping the rows by
# anchor id. File names are "<anchor>.<control>.<tissue>" or
# "<anchor>.<tissue>"; in the second form the control is reported as "None".
scores = Hash.new { |h, k| h[k] = [] }
Dir["#{folder_name}/*"].each do |filename|
	# Skip by-product files. The extension is compared exactly: the earlier
	# patterns (/.msg$/ and friends) left the dot unescaped, so any character
	# in front of "msg", "freq" or "xgmml" was accepted.
	next if %w[.msg .freq .xgmml].include?(File.extname(filename))

	anchor_name, control, tissue = File.basename(filename).split('.')
	anchor_symbol = anchor_name
	# with only two name parts the second one is the tissue, not a control
	control = "None" if tissue.nil?

	anchor_name = @gi[anchor_name].entrez_id if @gi.include? anchor_name

#	score = scores[File.basename(filename)] = score_network(filename, anchor_name)
	scores[anchor_name] << [anchor_symbol, control, network_expression(filename, anchor_name.to_s)]
	#anchor_scores = scores[anchor_name] << [anchor_symbol, control] + ScoringMethods.map { |m| self.send(m, filename, anchor_name.to_s) }

end	

# Print a tab-separated report: a header line, then one line per scored
# network, with the anchors ordered by their lists of rows.
puts %w[anchor control expression].join("\t")
scores.sort_by { |_, rows| rows }.each do |_, rows|
	rows.each { |row| puts row.join("\t") }
end

