'''
Parses output of Needle alignment of TWAS kmer and consensus sequence.
Outputs the aligned kmer position relative to the consensus to standard output. Start/stop positions are 0-based with inclusive start and exclusive stop positions (same as UCSC positions).
Also prints the alignment from Needle in fasta format to standard error. Sequence name will be the kmer followed by log10 p-value.
Usage: parse_needle_KmerOutput.py <Needle output file>
'''

import sys

class AlignmentError(ValueError):
    '''Raised when the Needle output cannot be read as a single kmer alignment.'''


def parse_kmer_alignment(lines):
    '''
    Collect the aligned kmer row from Needle pair-format output.

    lines: iterable of text lines (an open file works).
    Returns (name, seq): name is the first whitespace-separated field of the
    kmer rows, seq is the third field of every kmer row concatenated in file
    order (gap characters included).
    Raises AlignmentError if a row that is not labeled "consensus" carries a
    different name than the first one, or if a row has fewer than three fields.
    '''
    name = ""
    pieces = []
    for line in lines:
        #Skip comment/header lines, match-markup lines and the consensus rows
        if "#" in line or "|" in line or "consensus" in line:
            continue
        #Skip blank lines and markup lines that hold only mismatch marks
        #('.' or ':'); these have no '|' and would otherwise be read as a row
        if not line.strip().strip(".: \t"):
            continue
        fields = line.split()
        if len(fields) < 3:
            raise AlignmentError('Unexpected line in the Needle alignment: ' + line.strip())
        if name == "": #first kmer row reached, remember its name
            name = fields[0]
        elif name != fields[0]: #every later row must carry the same name
            raise AlignmentError('Something went wrong with parsing the Needle alignment. More than one sequence not labeled as consensus.')
        pieces.append(fields[2])
    return name, "".join(pieces)


def locate_kmer(seq):
    '''
    Locate the non-gap characters of an aligned kmer row.

    seq: the kmer row of the alignment, with '-' marking gap columns.
    Returns (kmer, start, stop, gaps):
      kmer  - seq with all '-' removed
      start - 0-based index of the first non-gap column (inclusive)
      stop  - 0-based index one past the last non-gap column (exclusive)
      gaps  - 0-based indices of '-' columns lying between start and stop
    All indices are alignment columns. The consensus rows are not read by this
    script, so the indices equal consensus coordinates only when the consensus
    row itself contains no gap characters.
    Raises AlignmentError if seq has no non-gap character.
    '''
    positions = [i for i, base in enumerate(seq) if base != '-']
    if not positions:
        raise AlignmentError('No aligned kmer sequence was found in the Needle alignment.')
    kmer = "".join(seq[i] for i in positions)
    #enumerate() yields 0-based indices, so the first index is already the
    #inclusive start and the exclusive stop is one past the last index
    start = positions[0]
    stop = positions[-1] + 1
    gaps = [i for i in range(start, stop) if seq[i] == '-']
    return kmer, start, stop, gaps


def main(argv):
    '''
    Run the script: argv[1] is the Needle output file.

    Writes "kmer, name, start, stop, gap positions" (tab-separated) to standard
    output and the kmer alignment in fasta format to standard error. Exits with
    the usage text when the argument count is wrong, and with an error message
    on standard error (non-zero status) when the alignment cannot be parsed.
    '''
    if len(argv) != 2:
        sys.exit(__doc__)

    try:
        with open(argv[1], 'r') as handle:
            name, seq = parse_kmer_alignment(handle)
        kmer, start, stop, gaps = locate_kmer(seq)
    except AlignmentError as err:
        #A string argument makes sys.exit print to stderr and exit non-zero,
        #keeping error text out of the tab-separated standard output
        sys.exit(str(err))

    gap_positions = ','.join(str(i) for i in gaps)

    #Write aligned kmer position relative to consensus to standard output
    print('\t'.join([kmer, name, str(start), str(stop), gap_positions]))
    #Write Needle alignment in fasta format to standard error
    print(">" + kmer + "_" + name + '\n' + seq, file=sys.stderr)


if __name__ == "__main__":
    main(sys.argv)

