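'''
Calculates the average genome divergence rate from a pairwise alignment file in axt format.
Only uses masked (lower case) base pairs; a site counts as masked when its query base is lower case.
The input file must be gzip-compressed with one alignment record per line. Each record is a
{...} group of comma-separated key:value fields whose 5th and 6th fields hold the target and
query sequences.
NOTE(review): this one-record-per-line layout is not the plain UCSC axt layout - confirm the expected input.
Usage: python3 calc_divergence_from_pairwiseAlign_maskedOnly.py <input file> <output file>
'''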

import sys
import gzip

def parse_alignment_line(line):
    '''Return the (target, query) aligned sequences held in one alignment record.

    The record must contain a {...} group of comma-separated key:value fields;
    the 5th and 6th fields carry the target and query sequences, optionally
    wrapped in double quotes.

    Raises IndexError if the line does not have that shape.
    '''
    fields = line.split('{')[1].split('}')[0].split(',')
    target_seq = fields[4].split(':')[1].strip('"')
    query_seq = fields[5].split(':')[1].strip('"')
    return target_seq, query_seq


def count_masked_sites(target_seq, query_seq):
    '''Count matching and mismatching masked sites of one aligned sequence pair.

    A column is evaluated only when neither base is a gap ('-'), neither base
    is N (any case), and the query base is not upper case (i.e. it is masked).
    Bases are compared case-insensitively.

    Returns a (num_match, num_mismatch) tuple.
    Raises ValueError if the query is shorter than the target.
    '''
    # Fail with a clear message up front rather than an IndexError partway
    # through the scan. A longer query is tolerated: the extra tail is ignored.
    if len(query_seq) < len(target_seq):
        raise ValueError(
            'query sequence (%d) is shorter than target sequence (%d)'
            % (len(query_seq), len(target_seq))
        )

    num_match = 0
    num_mismatch = 0
    for target_base, query_base in zip(target_seq, query_seq):
        if target_base == '-' or query_base == '-':  # Only evaluate if not a gap
            continue
        if target_base.upper() == 'N' or query_base.upper() == 'N':  # Skip N nucleotides
            continue
        if query_base.isupper():  # Skip unmasked bases (judged by the query base only)
            continue
        if target_base.upper() == query_base.upper():
            num_match += 1
        else:
            num_mismatch += 1
    return num_match, num_mismatch


def divergence_rate(num_match, num_mismatch):
    '''Return mismatches / (matches + mismatches), or NaN when no site was counted.'''
    total = num_match + num_mismatch
    if total == 0:
        # The rate is undefined without any evaluated site; NaN states that
        # explicitly instead of raising ZeroDivisionError.
        return float('nan')
    return num_mismatch / total


def main(argv=None):
    '''Read the gzip-compressed alignment file and write the divergence report.

    argv defaults to sys.argv: argv[1] is the input path and the last element
    is the output path.
    '''
    argv = sys.argv if argv is None else argv
    # With a single argument argv[-1] would be the input path itself, and the
    # report would overwrite the alignment file - refuse to run instead.
    if len(argv) < 3:
        sys.exit('Usage: python3 ' + argv[0] + ' <input file> <output file>')
    in_path = argv[1]
    out_path = argv[-1]

    num_match = 0
    num_mismatch = 0
    with gzip.open(in_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if not line.strip():  # Blank lines carry no alignment record
                continue
            target_seq, query_seq = parse_alignment_line(line)
            matches, mismatches = count_masked_sites(target_seq, query_seq)
            num_match += matches
            num_mismatch += mismatches

    # Compute the rate before opening the output so a failure never leaves a
    # half-written report behind.
    rate = divergence_rate(num_match, num_mismatch)
    if num_match + num_mismatch == 0:
        sys.stderr.write('Warning: no masked, ungapped, non-N sites found in ' + in_path + '\n')

    with open(out_path, 'w') as o:
        o.write('Input pairwise alignment file: ' + in_path + '\n')
        o.write('Average genome divergence (masked only) = ' + str(rate) + '\n')


if __name__ == '__main__':
    main()

