#!/usr/bin/env python
"""
Given one or more ranked lists, computes average percentile rank for union of items.
Update 2/23: following that, we RECOMPUTE percentile ranks over that result.
We also print out another column that indicates whether an item's recomputed percentile rank is at least 95 (i.e. it is in the top 5%).

Input file format:
name    score

Higher scores = top of the list.

Usage:
python combine_ranks.py outfile outColumnName infile1 [... infileN]
"""
import sys, csv, scipy.stats

def main(argv):
	"""Combine ranked score files into one re-ranked average-percentile table.

	Each input file is read with read_infile and converted to percentile
	ranks with compute_percrank. For every gene in the union of the inputs,
	the percentile ranks from the files that contain it are averaged (a gene
	present in only one file keeps that single value). The averages are then
	converted to percentile ranks once more and written to the output file
	from highest to lowest, together with a column that holds "top5" when
	the value is at least 95 and is empty otherwise.

	Args:
		argv: full argument vector:
			[prog, outfile, outColumnName, infile1, ..., infileN].

	Returns:
		2 if too few arguments were given, otherwise 0.
	"""
	if len(argv)<4:
		print("USAGE: python combine_ranks.py outfile outColumnName infile1 [... fileN]")
		return 2

	outFn=argv[1]
	outColName=argv[2]
	inFns=argv[3:]

	# percentile ranks per input file, keyed by file name
	rankings={}
	allgenes=set()
	for inFn in inFns:
		scores=read_infile(inFn)
		myrank=compute_percrank(scores)
		rankings[inFn]=myrank
		allgenes.update(myrank)
		print("Read scores for %d genes from %s" % (len(scores), inFn))
	print("Read scores for total of %d genes from %d files." % (len(allgenes), len(inFns)))

	# compute average per gene. for genes that appear in only one list, we only use the one value.
	avgs={}
	for g in allgenes:
		vals=[r[g] for r in rankings.values() if g in r]
		if not vals:
			# Not expected, since allgenes is built from the rankings' own keys.
			# Report it and skip the gene instead of dividing by zero.
			print("No values?? %s" % g)
			continue
		avgs[g]=sum(vals)/float(len(vals))

	# recompute the percrank on the averages
	avgs=compute_percrank(avgs)

	# now print the results from highest to lowest.
	sortavg=sorted(avgs.items(), key=lambda item: -item[1])

	with open(outFn,'w') as outf:
		outf.write("Gene\t%s_AvgPercRank\t%s_InTop5\n" % (outColName, outColName))
		for (g,val) in sortavg:
			top5="top5" if val>=95 else ""
			outf.write("%s\t%f\t%s\n" % (g,val,top5))

	print("Wrote %d ranks to %s." % (len(sortavg), outFn))
	return 0

def compute_percrank(scores):
	"""Convert a {gene: score} map into a {gene: percentile rank} map.

	The percentile rank of a gene is its 1-based rank among all scores
	(tied scores share the average of their ranks), divided by the number
	of scores and multiplied by 100. The highest score therefore gets 100.

	All ranks are computed with a single scipy.stats.rankdata call rather
	than one percentile lookup per gene, so the cost is one sort instead of
	one pass over every score for every gene.

	Args:
		scores: dict mapping gene name to numeric score.

	Returns:
		dict mapping each gene name to its percentile rank as a float;
		empty if scores is empty.
	"""
	genes=list(scores.keys())
	total=len(genes)
	ranking={}
	if total:
		# rankdata's default method gives tied values their average rank
		ranks=scipy.stats.rankdata([scores[g] for g in genes])
		percentiles=ranks/float(total)*100.0
		ranking=dict(zip(genes, percentiles.tolist()))
	print("Converted %s scores to %d percentile ranks." % (len(scores),len(ranking)))
	return ranking

def read_infile(inFn):
	"""Read a tab-delimited score file into a {gene: score} map.

	The first column is taken as the gene name and the second as its score.
	Rows with fewer than two columns (e.g. blank lines) and rows whose
	second column is not a number (e.g. a header line) are skipped. When a
	gene occurs more than once, a message is written to stderr and the
	last score read replaces the earlier one.

	Args:
		inFn: path of the file to read.

	Returns:
		dict mapping gene name to float score.
	"""
	scores={}
	with open(inFn,'r') as f:
		for row in csv.reader(f,delimiter="\t"):
			# csv.reader yields [] for blank lines; a row may also lack a score column
			if len(row)<2:
				continue
			gene=row[0]
			# Get score. If it is not numeric, it's probably the header line.
			try:
				score=float(row[1])
			except ValueError:
				continue
			if gene in scores:
				sys.stderr.write("Duplicate gene: %s %f, was %f\n" % (gene, score, scores[gene]))
			scores[gene]=score
	return scores


# Script entry point: run main on the command line and use its result as the exit status.
if "__main__"==__name__:
	exit_status=main(sys.argv)
	sys.exit(exit_status)
