# Script to process phylogenetic profiles into an MSA format as input to GLOOME
# (from Pupko) to infer gain/loss events -- could save me some time and energy.

import sys
import re

# Command-line arguments, as used below:
#   argv[1]  file listing the KOs of interest, one per line
#   argv[2]  table of phyletic profiles (space-separated, header row first)
#   argv[3]  file listing the species of interest, one per line
#   argv[4]  output path; a companion '<argv[4]>.meta' file is also written

# Load the KOs of interest (surrounding whitespace stripped).
with open(sys.argv[1]) as ko_list:
    kos = [ko.strip() for ko in ko_list]

# A single pre-formatted string prints the same bytes under Python 2 and 3.
print('there are  %d  kos of interest' % len(kos))

# Table of phyletic profiles. Deliberately left open: only the header is
# consumed here, the data rows are read later in the script.
phyl_prof = open(sys.argv[2])

# Load the species of interest (surrounding whitespace stripped).
with open(sys.argv[3]) as specs:
    spec_list = [spec.strip() for spec in specs]

# Indices of the KOs of interest, in header order. A set makes each membership
# test O(1), and enumerate avoids the repeated linear header.index() scans
# (a repeated header name now maps to its own column rather than the first).
# NOTE(review): the +1 offset presumably accounts for a leading species column
# that data rows have but the header row lacks -- confirm against the table.
header = phyl_prof.readline().strip().split(' ')
ko_set = set(kos)
ko_indices = [col + 1 for col, ko in enumerate(header) if ko in ko_set]

print('%d ko indices' % len(ko_indices))

# Write out KOs as metadata, to let the user know which KOs are which.
# NOTE(review): this lists every requested KO in sorted order, whereas the
# profile columns follow header order and only include KOs found in the
# header; the two line up only if the header is sorted and contains all KOs.
with open(sys.argv[4] + '.meta', 'w') as metadata:
    kos.sort()
    metadata.write('\n'.join(kos))

# Output alignment file; left open for the profile-writing loop further down.
outfile = open(sys.argv[4], 'w')

def binarize(x):
    """Return '1' if int(x) is strictly positive, otherwise '0'.

    x may be anything int() accepts (e.g. a count read from the profile
    table as text); int() raises ValueError for non-integer text.
    """
    return '1' if int(x) > 0 else '0'

# Walk the remaining profile rows and emit a binarized presence/absence
# record for every species of interest.
wanted_species = set(spec_list)  # O(1) membership test per row
try:
    for spec_phy in phyl_prof:
        # The species code is taken as the first three characters of the row.
        # NOTE(review): assumes every species code is exactly 3 characters
        # long -- confirm against the profile table.
        species = spec_phy[0:3]
        if species not in wanted_species:
            continue
        phyl = spec_phy.split(' ')
        prof = [binarize(phyl[ind]) for ind in ko_indices]
        # Each record starts with a newline, so the file begins with an empty
        # line and has no trailing newline after the last profile.
        outfile.write('\n>' + species + '\n' + ''.join(prof))
finally:
    # Close both handles even if a row fails to parse, so that everything
    # written so far is flushed to disk.
    phyl_prof.close()
    outfile.close()
