# to make a dataset with probabilistic ancestral reconstructions of gene presence/absence
# from GLOOME output
from collections import defaultdict
import sys
import re
import math

# call: python parse_reconstruction_082313.py GLOOME_run_13657254366576_etc/Ancestral_reconstuction_of_states_per_position_per_node.txt 

# Build the child -> parent lookup for the tree from the parentage table.
# Each data line is whitespace-separated: the first column is stored as the
# parent of the node named in the second column.
with open('gainLoss_results/MOtree_GLrun/TheTree.INodes.ph.parentage') as parentage_file:
	# read inside a `with` so the handle is closed even if reading fails
	parent_maps = parentage_file.readlines()

parentage = {}
for line in parent_maps:
	fields = line.split()
	if not fields:
		# blank or whitespace-only line: nothing to record
		continue
	parentage[fields[1]] = fields[0]

# 'N1' gets the sentinel parent 'NA' (presumably it is the root -- confirm
# against the tree file); nodes whose parent is 'NA' are skipped when
# branches are written out.
parentage['N1'] = 'NA'

# code for branches:
# 1 = absent both
# 2 = present both
# 3 = 'gain'
# 4 = 'loss'
# 5 = other

# NOTE(review): this goes to stdout ahead of the matrix printed further down;
# it looks like leftover debug output -- confirm nothing downstream relies on
# it before removing.  Parenthesised so it parses on Python 2 and Python 3.
print(sys.argv[1].split('/'))

# Nested dicts hold the genome content info across nodes:
#   genomes[node][position]   -> state string read for that node
#   ancestors[parent][position] -> state string of the row's node, filed under
#                                  that node's parent (a later row for another
#                                  child of the same parent overwrites it)
genomes = defaultdict(dict)
ancestors = defaultdict(dict)
positions = []	# one entry per data row; de-duplicated when the header is built
with open(sys.argv[1]) as anc:
	for ko in anc:
		# skip header rows, and blank lines (which would otherwise fail on
		# the missing columns)
		if ko.startswith(('branch', 'POS')) or not ko.strip():
			continue
		fields = ko.split('\t')
		# columns: node name, position, state
		node = fields[0]
		pos = int(fields[1])
		state = fields[2].strip()
		genomes[node][pos] = state
		positions.append(pos)
		# raises KeyError if the node is missing from the parentage table
		ancestors[parentage[node]][pos] = state
# Replicate ID = second '/'-separated component of the input path
# (so the path must contain at least one '/').
path_parts = sys.argv[1].split('/')
pos_string = path_parts[1]  # a unique ID for the simulation replicate

# Column labels: every distinct position in ascending order, each prefixed
# with the replicate ID.
poss = list(set(positions))
poss.sort()
allkos = ['%s_%s' % (pos_string, ko) for ko in poss]

# Each branch is labelled with the name of its descendant node.  Two profile
# files are produced next to the input: the descendant-node states
# (.desc_profile) and the ancestor-node states (.anc_profile); the branch
# classification matrix itself goes to stdout.
out_desc = open(sys.argv[1] + '.desc_profile', 'w')
out_anc = open(sys.argv[1] + '.anc_profile', 'w')

# The same header row (leading empty cell, then the position labels) heads
# both files and the stdout matrix.
header = '\t' + '\t'.join(allkos)
for handle in (out_desc, out_anc):
	handle.write(header + '\n')

# Placeholder entry for the 'NA' parent sentinel; the branch loop skips it.
genomes['NA'] = 'NA'

print(header)

# translate an (ancestor, descendant) state pair into a branch code
def classifier(anc,desc):
	"""Return the branch code for one gene on one branch.

	anc and desc are the states of the ancestor and descendant nodes;
	only the strings '0' (absent) and '1' (present) are recognised.

	Returns '1' if absent in both, '2' if present in both, '3' for a
	gain ('0' -> '1'), '4' for a loss ('1' -> '0'), and '5' when either
	state is anything other than '0' or '1'.
	"""
	binary_states = ('0', '1')
	# any unrecognised state on either end falls into the catch-all code
	if anc not in binary_states or desc not in binary_states:
		return '5'
	# unchanged along the branch
	if anc == desc:
		return '2' if anc == '1' else '1'
	# changed along the branch: the descendant state tells gain from loss
	return '3' if desc == '1' else '4'

# Emit one row per branch, labelled with the descendant node's name:
#   - descendant states  -> .desc_profile
#   - ancestor states    -> .anc_profile
#   - classifier() codes -> stdout
for org in sorted(genomes):
	# skip the 'NA' placeholder entry ...
	if org == 'NA':
		continue
	# ... and any node whose parent is the 'NA' sentinel (no branch above it)
	parent = parentage[org]
	if parent == 'NA':
		continue

	genome = genomes[org]
	anc_genome = genomes[parent]

	# columns follow this node's own positions in ascending order
	kos = sorted(genome)
	desc_states = [genome[ko] for ko in kos]
	anc_states = [anc_genome[ko] for ko in kos]
	branch_codes = [classifier(a, d) for a, d in zip(anc_states, desc_states)]

	out_desc.write('\t'.join([org] + desc_states) + '\n')
	out_anc.write('\t'.join([org] + anc_states) + '\n')
	print('\t'.join([org] + branch_codes))

out_desc.close()
out_anc.close()
