'''
Extracts out the sequence aligning to a given length of the consensus sequence (does not include gaps when counting positions of consensus).
Each "fasta" entry must contain the sequence of interest followed by the aligned consensus in the next line. The consensus for all "fasta" entries must be the same ("standardized").
Does NOT remove gaps in the output to conserve alignments and makes all nucleotide letters into uppercase.
Usage: python3 extract_needleAlignedSequences_givenConsensusLength_standardConsensus.py <input needleAlign "fasta" file> <Length of consensus to extract> <output file>
'''

import sys

def parse_length(text):
    """Convert the command-line length argument to an int.

    Returns the parsed integer, or None when *text* is not a valid integer
    literal. int() signals a malformed string with ValueError (not
    TypeError), so that is the exception handled here.
    """
    try:
        return int(text)
    except ValueError:
        return None


def extract_aligned_prefix(seq, consensus, extract_length):
    """Return the part of *seq* aligned to the first *extract_length*
    non-gap characters of *consensus*.

    Gap characters ('-') in the consensus occupy an alignment column but do
    not count towards *extract_length*. The returned slice of *seq* keeps its
    own gaps and is converted to uppercase.

    Returns None when the consensus contains fewer than *extract_length*
    non-gap characters (this includes any *extract_length* below 1, because
    the running count starts at 1 on the first non-gap character).
    """
    position = 0
    for column, base in enumerate(consensus):
        if base == '-':
            continue
        position += 1
        if position == extract_length:
            # column is 0-based, so include it in the slice with +1.
            return seq[:column + 1].upper()
    return None


def main(argv):
    """Run the extraction described in the module docstring.

    *argv* is the full argument vector: script name, input file, length of
    consensus to extract, output file. Exits with the module docstring as
    the usage message when the arguments are malformed.
    """
    if len(argv) != 4:
        sys.exit(__doc__)

    extract_length = parse_length(argv[2])
    if extract_length is None:
        print('<Length of consensus to extract> must be integer')
        sys.exit(__doc__)

    # Open the input first so that a missing/unreadable input file does not
    # leave behind a freshly truncated output file.
    with open(argv[1], 'r') as f, open(argv[3], 'w') as o:
        for line in f:
            if '>' not in line:
                continue
            # Each entry is three lines: header, sequence, aligned consensus.
            name = line.rstrip('\n')
            seq = f.readline().rstrip('\n')
            consensus = f.readline().rstrip('\n')
            extracted = extract_aligned_prefix(seq, consensus, extract_length)
            # Entries whose consensus is shorter than the requested length
            # are skipped without output.
            if extracted is not None:
                o.write(name + '\n')
                o.write(extracted + '\n')


if __name__ == '__main__':
    main(sys.argv)

