'''
Gets the underlying sequence below the peaks from the output of find_SHARPR_peaks_above_cutoff.py if the peaks are longer than a given length requirement.
Combines peaks that are within 20bp of each other.
Usage: python3 get_underlying_seq_under_peak.py <input peaks file> <full length elements sequence file> <peak length requirement> <output file>
'''

import sys

# Maximum gap (in bp) between two consecutive peaks of the same element for
# them to be combined into a single peak.
peak_overlap = 20

# Prefix of FASTA header names that is dropped so that the names match the
# first column of the peaks file.
ELEMENT_PREFIX = 'element_'


def parse_cutoff(raw):
    """Convert the peak length requirement argument to an int.

    Prints an explanation and exits with the usage text when *raw* is not a
    valid integer literal.
    """
    try:
        return int(raw)
    except ValueError:
        # int() raises ValueError (not TypeError) for a non-numeric string.
        print('Peak length requirement needs to be an integer >= 0')
        sys.exit(__doc__)


def read_sequences(lines):
    """Return a dict mapping element name -> sequence.

    *lines* is an iterable of text lines (e.g. an open file) in which every
    header line starting with '>' is immediately followed by one line holding
    the full sequence. The leading '>' and an optional 'element_' prefix are
    removed from the header to form the name.
    """
    sequences = {}
    line_iter = iter(lines)
    for line in line_iter:
        if not line.startswith('>'):
            continue
        name = line[1:].rstrip('\n')
        # Remove the prefix as a whole; str.lstrip() would strip any leading
        # run of those *characters* and eat into names such as 'test1'.
        if name.startswith(ELEMENT_PREFIX):
            name = name[len(ELEMENT_PREFIX):]
        # The sequence is the line right after the header ('' at end of file).
        sequences[name] = next(line_iter, '').rstrip('\n')
    return sequences


def read_peaks(lines):
    """Yield (name, start, stop) for every non-blank tab-separated line."""
    for line in lines:
        if not line.strip():
            # Tolerate blank lines, e.g. a trailing empty line.
            continue
        fields = line.rstrip('\n').split('\t')
        yield fields[0], int(fields[1]), int(fields[2])


def merge_peaks(records, max_gap=peak_overlap):
    """Combine consecutive peaks of the same element that lie close together.

    *records* is an iterable of (name, start, stop). A peak is merged into the
    most recently stored peak of the same name when its start is at most
    *max_gap* past that peak's stop.

    Returns (order, peaks): *order* lists the names in order of first
    appearance, *peaks* maps each name to its list of [start, stop] regions.
    """
    order = []
    peaks = {}
    for name, start, stop in records:
        if name not in peaks:
            peaks[name] = [[start, stop]]
            order.append(name)
            continue
        last_peak = peaks[name][-1]
        if start <= last_peak[1] + max_gap:
            # Keep the furthest stop so that a peak nested inside the previous
            # one cannot shrink the merged region.
            last_peak[1] = max(last_peak[1], stop)
        else:
            peaks[name].append([start, stop])
    return order, peaks


def peak_sequences(order, peaks, sequences, cutoff):
    """Yield (name, subsequence) for every peak longer than *cutoff*.

    Peaks whose length (stop - start) is less than or equal to *cutoff* are
    skipped. Raises KeyError if a reported peak names an element that is not
    present in *sequences*.
    """
    for name in order:
        for start, stop in peaks[name]:
            if stop - start <= cutoff:
                continue
            yield name, sequences[name][start:stop]


def main():
    """Command-line entry point; see the module docstring for usage."""
    if len(sys.argv) != 5:
        sys.exit(__doc__)

    cutoff = parse_cutoff(sys.argv[3])

    with open(sys.argv[2], 'r') as f:
        sequences = read_sequences(f)

    with open(sys.argv[1], 'r') as f:
        order, peaks = merge_peaks(read_peaks(f))

    with open(sys.argv[4], 'w') as o:
        for name, peak_seq in peak_sequences(order, peaks, sequences, cutoff):
            o.write(name + '\t' + peak_seq + '\n')


if __name__ == '__main__':
    main()


