#!/usr/bin/env ruby

require 'gene_info'
require 'returning'
require 'nokogiri'
require 'mean'
require 'set'

# Sums -ln(confidence) over every edge listed in +filename+.
#
# Each line is split on runs of whitespace (the same way read_network splits
# its input) and the third field is read as the edge confidence. Lines with
# fewer than three fields (blank or truncated lines) are skipped instead of
# being treated as a confidence of 0, which would add Infinity to the total.
#
# filename    - path of the network file to read.
# anchor_name - not used by this method.
#
# Returns the accumulated total (Integer 0 when no edge was read).
def total_network_size(filename, anchor_name)
	total = 0
	File.foreach(filename) do |line|
		confidence = line.split[2]
		next if confidence.nil?

		total -= Math.log(confidence.to_f)
	end

	total
end

# Reads a whitespace-separated edge list and returns it as an undirected
# adjacency hash: network[a][b] == network[b][a] == 1.
#
# Only the first two fields of each line (the two endpoints) are used. Every
# edge gets unit weight; any confidence column in the file is ignored, so path
# lengths computed on the result are hop counts.
#
# Lines with fewer than two fields (blank or truncated lines) are skipped so
# that no nil node ends up in the network.
#
# filename - path of the network file to read.
#
# Returns a Hash of node => { neighbour => 1 }. The hash keeps its default
# block, so looking up an unknown node creates an empty neighbour hash for it.
def read_network(filename)
	network = Hash.new { |hash, key| hash[key] = {} }

	File.foreach(filename) do |line|
		from, to = line.split
		next if to.nil?

		network[from][to] = 1
		network[to][from] = 1
	end

	network
end

# Computes shortest-path distances from +from+ to every node reachable in
# +network+, repeatedly picking the closest unvisited node and relaxing its
# outgoing edges.
#
# from    - the start node.
# network - Hash of node => { neighbour => edge weight }.
#
# Returns a Hash of node => distance holding only +from+ and the nodes that
# were reached. Its default value is Float::INFINITY, so looking up an
# unreached node yields infinity without storing an entry.
def shortest_paths(from, network)
	distances = Hash.new(Float::INFINITY)
	distances[from] = 0
	queue = network.keys

	until queue.empty?
		u = queue.min_by { |v| distances[v] }
		queue.delete(u)

		# The closest remaining node is at infinite distance, so everything still
		# queued is unreachable. (The check has to look at the node's distance;
		# comparing the node itself to infinity never matches.)
		break if distances[u] == Float::INFINITY

		network[u].each do |v, d|
			alt = distances[u] + d
			distances[v] = alt if alt < distances[v]
		end
	end

	distances
end

# Largest shortest-path distance from +anchor_name+ to any node it reaches in
# the network read from +filename+.
def max_distance_from_anchor(filename, anchor_name)
	distances = shortest_paths(anchor_name, read_network(filename))
	distances.values.max
end

# Mean of the shortest-path distances from +anchor_name+ to the nodes it
# reaches in the network read from +filename+. Relies on Array#mean, which is
# not core Ruby (this file loads it via require 'mean').
def mean_distance_from_anchor(filename, anchor_name)
	distances = shortest_paths(anchor_name, read_network(filename))
	distances.values.mean
end

# Number of nodes in the network read from +filename+ whose identifier,
# converted with to_i, is known to @gi. Other nodes (terminals, which would
# have invalid gene names) are not counted.
#
# anchor_name - not used by this method.
def num_nodes(filename, anchor_name)
	read_network(filename).count { |node, _neighbours| @gi.include?(node.to_i) }
end

# Number of edges in the network read from +filename+ whose two endpoints are
# both known to @gi (identifiers are converted with to_i before the lookup).
# Edges touching any other node (terminals, which would have invalid gene
# names) are not counted.
#
# read_network stores every edge in both directions, so the per-node neighbour
# counts are summed and halved (integer division).
#
# anchor_name - not used by this method.
#
# Returns an Integer; 0 when the network is empty or no node is known to @gi.
def num_edges(filename, anchor_name)
	network = read_network(filename)
	network.delete_if { |node, _neighbours| !@gi.include?(node.to_i) } # remove terminals (these would have invalid gene names)

	degrees = network.values.map do |neighbours|
		neighbours.keys.count { |neighbour| @gi.include?(neighbour.to_i) }
	end

	# The explicit 0 seed keeps an empty network from producing nil / 2.
	degrees.reduce(0, :+) / 2
end

# Jaccard index between the node set of the network in +filename+ and the node
# set of a second network file in the same directory.
#
# The second file's name is built from the first and last dot-separated parts
# of +filename+'s base name (e.g. "A.ctrl.liver" -> "A.liver"). Only the base
# name is split, so dots in the directory part of the path ("./out",
# "run.v2/...") do not affect the result.
#
# In both networks only nodes whose identifier (converted with to_i) is known
# to @gi are kept.
#
# anchor_name - not used by this method.
#
# Returns a Float: |intersection| / |union|. This is NaN when both node sets
# are empty (0.0 / 0).
def jaccard_nodes(filename, anchor_name)
	network = read_network(filename)
	network.delete_if { |k, _v| !@gi.include?(k.to_i) } # remove terminals (these would have invalid gene names)

	name_parts = File.basename(filename).split('.')
	tissue_filename = File.join(File.dirname(filename), name_parts.values_at(0, -1).join('.'))

	tissue = read_network(tissue_filename)
	tissue.delete_if { |k, _v| !@gi.include?(k.to_i) } # remove terminals (these would have invalid gene names)

	s1 = Set.new(network.keys)
	s2 = Set.new(tissue.keys)

	(s1 & s2).size.to_f / (s1 | s2).size
end

# Sums the absolute expression values of the genes in the network file
# +filename+, leaving out the anchor.
#
# The expression column is the text after the last "_" of +filename+,
# lower-cased, with 'diff' appended. Endpoints are taken from the first two
# fields of each line and converted with to_i; lines where either one converts
# to 0 are skipped. Identifiers known to @gi are replaced by their default_id
# (the anchor included). Genes with no entry in @expression, or no value in
# the column, contribute nothing.
#
# Returns the sum as a Float (it is not divided by the number of genes).
def network_expression(filename, anchor_name)
	anchor_key = anchor_name.to_i
	anchor = @gi.include?(anchor_key) ? @gi[anchor_key].default_id : anchor_name

	genes = Set.new
	IO.foreach(filename) do |line|
		from, to = line.chomp.split(/\s/).first(2).map(&:to_i)
		next if from == 0 || to == 0

		[from, to].each do |id|
			genes << (@gi.include?(id) ? @gi[id].default_id : id)
		end
	end

	tissue = filename.match(/_([^_]+)$/)[1].downcase + 'diff'

	genes.inject(0.0) do |score, gene|
		next score if gene == anchor
		next score unless @expression.include?(gene)

		level = @expression[gene][tissue]
		level.nil? ? score : score + level.abs
	end
end

# Unfinished: no enrichment is computed yet. Currently this only returns the
# nodes of the network in +filename+ whose identifier (converted with to_i) is
# known to @gi; terminals would have invalid gene names and are dropped.
#
# anchor_name - not used by this method.
def kegg_enrichment(filename, anchor_name)
	read_network(filename).keys.select { |gene| @gi.include?(gene.to_i) }
end

# Names of the scoring methods evaluated for every network file. The order
# here is the order of the score columns in the printed table.
ScoringMethods = [
	:max_distance_from_anchor,
	:mean_distance_from_anchor,
	:num_nodes,
	:num_edges,
	:jaccard_nodes,
	:network_expression
]

################################
# Builds a key for the edge between +from+ and +to+ that does not depend on
# argument order: both endpoints are converted to strings, ordered by string
# comparison and joined with "_".
def edge_name(from, to)
	[from.to_s, to.to_s].sort.join('_')
end

# Stores the attributes of the edge between +from+ and +to+ in +edges+.
#
# The entry is keyed by edge_name(from, to), which is the same for (a, b) and
# (b, a), so adding the reverse edge later overwrites this entry.
#
# edges      - Hash of edge key => attribute Hash; edges[key] must yield a
#              Hash (e.g. through a default block that creates one).
# from, to   - the edge endpoints, stored under 'source' and 'target'.
# confidence - stored under 'confidence' as given.
# directed   - stored under 'directed' as given.
#
# Returns the edge key.
def add_edge(edges, from, to, confidence, directed)
	edge_str = edge_name(from, to)

	edge = edges[edge_str]
	edge['source'] = from
	edge['target'] = to
	edge['confidence'] = confidence
	edge['directed'] = directed

	edge_str
end

# Fills in the attribute hash of node +from+ inside +nodes+.
#
# "type"       - 'terminal' when +terminals+ includes the node, overridden by
#                'anchor' when the node equals +anchor+; otherwise left unset.
# "name"       - always set to the node itself.
# "expression" - @expression[from][tissue], set only when @expression has an
#                entry for the node and that value is not nil.
#
# gi - not used by this method.
#
# Returns the expression value when one was stored, nil otherwise.
def add_node(nodes, terminals, anchor, gi, from, tissue)
	node = nodes[from]

	node["type"] = 'terminal' if terminals.include?(from)
	node["type"] = 'anchor' if anchor == from
	node["name"] = from

	level = @expression.include?(from) ? @expression[from][tissue] : nil
	node["expression"] = level unless level.nil?
end

# Converts the edge list in +filename+ into an XGMML document.
#
# Each line is split on single whitespace characters: fields 0, 1 and 2 are
# the two endpoints and the confidence, and the LAST field is the direction
# flag ('1' marks a directed edge). All-digit endpoints are converted to
# integers, and endpoints known to +gi+ are replaced by their default_id.
#
# filename  - path of the network file; its trailing "_<word>" part,
#             lower-cased with 'diff' appended, selects the expression column.
# anchor    - node to be typed 'anchor' (passed on to add_node).
# terminals - collection of nodes to be typed 'terminal' (passed on to add_node).
# gi        - gene lookup responding to include? and [] (entries respond to
#             default_id).
# score     - value written to the graph-level 'score' attribute.
# pubmed    - Hash of raw id => { raw id => value }; the value found for an
#             edge is written to its 'pubmeds' attribute.
#
# Returns the document produced by Nokogiri::XML::Builder.
def anat_to_xgmml(filename, anchor, terminals, gi, score, pubmed)
	nodes = Hash.new { |h, k| h[k] = {} }
	edges = Hash.new { |h, k| h[k] = {} }

	# NOTE(review): network_expression derives the column with /_([^_]+)$/;
	# the two patterns disagree when the trailing word itself contains "_".
	# Confirm which one is intended.
	tissue = filename.match(/_(\w+)$/)[1].downcase + 'diff'

	IO.foreach(filename) do |line|
		columns = line.chomp.split(/\s/)
		from, to, confidence = columns.values_at(0, 1, 2)
		direction = columns.last

		# The pubmed table is queried with the identifiers exactly as they appear
		# in the file, before they are translated below.
		edge_pubmeds = nil
		edge_pubmeds = pubmed[from][to] if pubmed.include?(from) && pubmed[from].include?(to)

		from, to = [from, to].map do |id|
			id = id.to_i if id =~ /^\d+$/
			gi.include?(id) ? gi[id].default_id : id
		end

		[from, to].each { |id| add_node(nodes, terminals, anchor, gi, id, tissue) }

		edge_id = add_edge(edges, from, to, confidence, direction == '1')
		edges[edge_id]['pubmeds'] = edge_pubmeds unless edge_pubmeds.nil?
	end

	builder = Nokogiri::XML::Builder.new do |xml|
		graph_label = File.basename(filename)
		xml.graph('xmlns' => 'http://www.cs.rpi.edu/XGMML', 'label' => graph_label, 'directed' => '0') do
			xml.att :name => 'score', :type => 'real', :value => score

			nodes.each do |id, node|
				xml.node(:id => id, :label => id) do
					# 'type' is written for every node, even when it was never set.
					xml.att :name => 'type', :type => 'string', :value => node['type']
					xml.att(:name => 'name', :type => 'string', :value => node['name']) if node.include? 'name'
					xml.att(:name => 'expression', :type => 'real', :value => node['expression']) if node.include? 'expression'
				end
			end

			edges.each do |_key, edge|
				# The endpoints are removed from the stored attribute hash here.
				source = edge.delete("source")
				target = edge.delete("target")
				edge_label = "#{source} (pp) #{target}"
				directed = edge['directed']

				xml.edge(:source => source, :target => target, :label => edge_label, :id => edge_label) do
					xml.att(:name => 'pubmeds', :type => 'string', :value => edge['pubmeds']) if edge.include? 'pubmeds'
					xml.att(:name => 'confidence', :type => 'real', :value => edge['confidence'])

					# Line width is the confidence scaled by 10 and truncated;
					# directed edges additionally get an arrow head.
					graphics = { :fill => "#000000", :width => (edge['confidence'].to_f * 10).to_i }
					graphics[:arrow] = 'last' if directed
					xml.graphics graphics

					xml.att(:name => 'interaction', :type => 'string', :value => (directed ? 'pd' : 'pp'))
				end
			end
		end
	end

	builder.doc
end

# ---------------------------------------------------------------------------
# Script body.
#
# Takes one argument, FOLDER_NAME. Reads "FOLDER_NAME.terminals.txt" plus the
# expression and pubmed tables under ../input (relative to this file), scores
# every network file found in FOLDER_NAME, writes a "<file>.xgmml" next to
# each network file that does not have one yet, and prints a tab-separated
# score table to standard output.
# ---------------------------------------------------------------------------
folder_name = ARGV.shift
# Without a folder every path below would be built from an empty name (and the
# directory scan would run on "/*"), so stop here with a usage message.
abort "Usage: #{File.basename($PROGRAM_NAME)} FOLDER_NAME" if folder_name.nil?

@gi = GeneInfo.for_species(GeneInfo::Human)

# Terminals: one identifier per line, replaced by its default_id when known to @gi.
terminals_filename = "#{folder_name}.terminals.txt"
terminals = Set.new
File.foreach(terminals_filename) do |line|
	name = line.chomp
	name = @gi[name].default_id if @gi.include? name

	terminals << name
end

# Expression table: tab-separated. The first row holds the column titles and
# the last column of every row is the gene. 'NA' cells are left out; all other
# cells are converted with to_f.
expression_filename = File.join(File.dirname(__FILE__), '../input/expression.diff.txt')
@expression = Hash.new { |hash, key| hash[key] = {} }
titles = nil
File.foreach(expression_filename) do |line|
	arr = line.chomp.split(/\t/)
	if titles.nil?
		titles = arr[0...-1].map(&:downcase)
		next
	end

	gene = arr.pop
	gene = @gi[gene].default_id if @gi.include? gene
	values = titles.zip(arr).select { |_title, value| value != 'NA' }
	@expression[gene] = Hash[values.map { |title, value| [title, value.to_f] }]
end

# Pubmed table: tab-separated "from, to, ids", stored in both directions.
pubmed_filename = File.join(File.dirname(__FILE__), '../input/H_sapiens-ppi-pdna.eat')
pubmed = Hash.new { |hash, key| hash[key] = {} }
File.foreach(pubmed_filename) do |line|
	from, to, ids = line.chomp.split(/\t/)
	pubmed[from][to] = ids
	pubmed[to][from] = ids
end

scores = Hash.new { |h, k| h[k] = [] }
Dir["#{folder_name}/*"].each do |filename|
	# Skip by extension; the dot is escaped so that only real ".msg", ".freq"
	# and ".xgmml" suffixes match.
	next if filename =~ /\.(?:msg|freq|xgmml)$/

	# Base names are "<anchor>.<control>.<tissue>" or "<anchor>.<tissue>"; in
	# the two-part form there is no control, which is reported as "None".
	anchor_name, control, tissue = File.basename(filename).split('.')
	anchor_symbol = anchor_name
	control = "None" if tissue.nil?

	anchor_name = @gi[anchor_name].entrez_id if @gi.include? anchor_name

	# send is required because the scoring methods are private top-level methods.
	row = [anchor_symbol, control] + ScoringMethods.map { |m| send(m, filename, anchor_name.to_s) }
	scores[anchor_name] << row

	xgmml_filename = "#{filename}.xgmml"
	next if File.exist?(xgmml_filename)

	# Build the document before opening the output file, so a failure here does
	# not leave behind an empty .xgmml that later runs would skip.
	doc = anat_to_xgmml(filename, @gi[anchor_name].default_id, terminals, @gi, 0, pubmed)
	File.open(xgmml_filename, 'w') { |f| doc.write_xml_to(f) }
end

header = ['anchor', 'control'] + ScoringMethods
puts header.join("\t")
scores.sort_by { |_anchor, rows| rows }.each do |_anchor, rows|
	rows.each { |row| puts row.join("\t") }
end

