'''
Calls matches/mismatches/gaps at each position compared to consensus based on alignment file.
Differentiates transition and transversion mismatches.
Only considers consensus positions.
Counts degenerate bases (for LTR18A M, W, K, N bases) as gaps.
Usage: python3 call_differences_fromConsensus_needleAlign.py <full consensus alignment file> <output file>
'''

import sys

# Consensus bases treated as degenerate; such consensus positions are reported as 'D'.
DEGENERATE_BASES = frozenset('MWKN')

# (consensus base, sequence base) pairs reported as transitions (A<->G, C<->T).
TRANSITIONS = frozenset([('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')])


def call_differences(seq, consensus):
	'''
	Return the call string for one aligned sequence/consensus pair.

	seq and consensus are the two upper-case rows of a pairwise alignment and
	must have the same length (the caller checks this). One character is
	emitted per consensus position; columns where the consensus has a gap
	(insertions in the sequence) are skipped entirely.

	Call characters:
		'-'  the sequence has a gap at this consensus position
		'D'  the consensus base is degenerate (M, W, K or N)
		'M'  the sequence base matches the consensus base
		'1'  transition (A<->G or C<->T)
		'2'  any other mismatch (counted as a transversion)
	'''
	calls = []
	for seq_nuc, consensus_nuc in zip(seq, consensus):
		if consensus_nuc == '-':
			continue  # only consensus positions are considered
		if seq_nuc == '-':
			# The gap check comes before the degenerate check, so a gap
			# opposite a degenerate consensus base is still reported as '-'.
			calls.append('-')
		elif consensus_nuc in DEGENERATE_BASES:
			calls.append('D')
		elif seq_nuc == consensus_nuc:
			calls.append('M')
		elif (consensus_nuc, seq_nuc) in TRANSITIONS:
			calls.append('1')
		else:
			# Not a match and not one of the four transitions.
			calls.append('2')
	# Join once at the end instead of growing a string character by character.
	return ''.join(calls)


def parse_alignment(lines):
	'''
	Parse alignment records and return a list of (name, call string) tuples.

	lines is an iterable of text lines (e.g. an open file). Each record is a
	'>' header line followed by the aligned sequence line and then the aligned
	consensus line. Lines outside of records are ignored. If the input ends
	right after a header, the missing lines are read as empty strings.

	Records are returned in file order. Records sharing a name are all kept
	with their own calls rather than overwriting one another.

	Exits with an error message and the usage text if a record's sequence and
	consensus lines differ in length.
	'''
	lines = iter(lines)
	records = []
	for line in lines:
		if not line.startswith('>'):
			continue
		name = line.rstrip('\n').lstrip('>')
		# The two lines after the header are consumed here, so the outer
		# loop resumes at the line following the consensus.
		seq = next(lines, '').rstrip('\n').upper()
		consensus = next(lines, '').rstrip('\n').upper()
		if len(seq) != len(consensus): #check to make sure
			print(name + ' sequence and consensus do not have the same length')
			sys.exit(__doc__)
		records.append((name, call_differences(seq, consensus)))
	return records


def write_calls(records, path):
	'''
	Write (name, call string) records to path as '>name' / call-string line pairs.
	'''
	with open(path, 'w') as o:
		for name, calls in records:
			o.write('>' + name + '\n')
			o.write(calls + '\n')


def main(argv):
	'''
	Run the script: argv is [program, alignment file, output file].

	Exits with the usage text if the number of arguments is wrong. The output
	file is only opened after the whole alignment has been read successfully.
	'''
	if len(argv) != 3:
		sys.exit(__doc__)
	with open(argv[1], 'r') as f:
		records = parse_alignment(f)
	write_calls(records, argv[2])


if __name__ == '__main__':
	main(sys.argv)

