'''
Gets transition/transversion rates from called transitions/transversions file.
Separates sequences based on species in fasta name. Species name follows ";" in the fasta name.
Usage: python3 get_transition_transversion_rates_species.py <input file> <output file>
'''

import sys

def _count_calls(lines):
    '''Tally call codes per species from the lines of a called-sites file.

    A line starting with '>' is a header; the species is the text after the
    last ';' in the stripped header (the whole header, '>' included, when it
    contains no ';'). Every following non-header line is counted towards that
    species until the next header, so sequences wrapped over several lines
    are counted in full and a header with no sequence line is still
    registered. Lines before the first header are ignored.

    Returns a dict mapping species -> [transitions, transversions, total],
    in order of first appearance (dicts keep insertion order in Python 3.7+).
    '''
    counts = {}
    current = None
    for line in lines:
        if line.startswith('>'):
            species = line.strip().split(';')[-1]
            current = counts.setdefault(species, [0, 0, 0])
            continue
        if current is None:
            continue
        # Only 'M' (match), '1' (transition) and '2' (transversion) count
        # towards the total; '-' (gap), 'D' (degenerate base) and any other
        # character are skipped.
        transitions = line.count('1')
        transversions = line.count('2')
        current[0] += transitions
        current[1] += transversions
        current[2] += line.count('M') + transitions + transversions
    return counts


def _rate(count, total):
    '''Return count/total as a string, or 'NA' when no site was counted.'''
    # A species made up only of gaps/degenerate bases has total == 0; report
    # 'NA' instead of raising ZeroDivisionError halfway through the output.
    return str(count / total) if total else 'NA'


def main(argv=None):
    '''Read the called-sites file argv[1] and write per-species rates to argv[2].

    argv defaults to sys.argv. Exits with the module docstring as the usage
    message unless exactly two arguments follow the program name. The output
    is tab-separated with a header row and, for every species, one row each
    for Conserved, Transitions and Transversions.
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        sys.exit(__doc__)

    with open(argv[1], 'r') as f:
        counts = _count_calls(f)

    with open(argv[2], 'w') as o:
        o.write('Species\tType\tRate\tNumber\n')
        for species, (transitions, transversions, total) in counts.items():
            conserved = total - transitions - transversions
            rows = (
                ('Conserved', conserved),
                ('Transitions', transitions),
                ('Transversions', transversions),
            )
            for label, number in rows:
                o.write(species + '\t' + label + '\t' + _rate(number, total) + '\t' + str(number) + '\n')


if __name__ == '__main__':
    main()

