'''
Calculates the average percent match for sliding windows across a list of elements.
Does not include windows where 50% or less of the window size is match or mismatch (i.e. the rest is gap or degenerate base).
Requires every sequence in the input aligned fasta (.fa) file to have the same length (same number of windows).
Consensus stop is non-inclusive (i.e. start 0, stop 20 is window from base0 to base19).
***This script version only outputs significantly higher match (conservation) windows. Background is all other windows mean match percentage.***
Usage: calc_slidingWindow_byList.py <input aligned .fa file> <element list file> <sliding window size> <output file>
'''

import sys
import statistics
from scipy import stats

fdr = 0.05 #Change as necessary. Overall significance level; divided by the number of windows (Bonferroni) before use


def _parse_window_size(text):
	'''
	Convert the <sliding window size> argument to an int greater than 0.
	Prints an error and exits with the usage message when text is not such an integer.
	'''
	try:
		window_size = int(text)
	except (TypeError, ValueError): #int() raises ValueError, not TypeError, for a non-numeric string
		window_size = 0
	if window_size <= 0:
		print('<sliding window size> must be an integer greater than 0')
		sys.exit(__doc__)
	return window_size


def _read_element_names(path):
	'''
	Read the element list file: one name per line, an optional leading '>' is removed.
	Blank lines are skipped so they are not looked up as an element named ''.
	Returns the names in file order.
	'''
	names = []
	with open(path, 'r') as f:
		for line in f:
			name = line.lstrip('>').rstrip()
			if name:
				names.append(name)
	return names


def _score_windows(seq, window_size):
	'''
	Score every sliding window of an aligned match string.
	'M' is a match and 'X' a mismatch; any other character (gap '-', degenerate base 'D') is not counted.
	Returns one entry per window start: the fraction of matches among counted positions,
	or None when the counted positions are not more than half of the window size.
	'''
	scores = []
	#+1 so the final window, whose non-inclusive stop equals len(seq), is scored too
	for start in range(len(seq) - window_size + 1):
		window = seq[start:start + window_size]
		num_match = window.count('M')
		total = num_match + window.count('X')
		if total > (window_size / 2):
			scores.append(num_match / total)
		else:
			scores.append(None)
	return scores


def _read_window_scores(path, window_size):
	'''
	Read the aligned fasta file and score the windows of every record.
	A record is a '>' header line followed by one or more sequence lines, which are joined.
	Header names are normalised the same way as the element list (leading '>' and trailing whitespace removed).
	Returns a dict mapping record name to its list of window scores.
	'''
	di = {}
	name = None
	chunks = []
	with open(path, 'r') as f:
		for line in f:
			if line.startswith('>'):
				if name is not None:
					di[name] = _score_windows(''.join(chunks), window_size)
				name = line.lstrip('>').rstrip()
				chunks = []
			elif name is not None:
				chunks.append(line.strip()) #strip() also drops a '\r' from CRLF files so it is not counted as a base
	if name is not None:
		di[name] = _score_windows(''.join(chunks), window_size)
	return di


def _collect_window_scores(di, list_elements):
	'''
	Gather, for each window, the scores of the listed elements that have a usable score there.
	The expected number of windows is taken from the last record in di.
	Exits with the usage message if a listed element is missing from di or has a different number of windows.
	Returns a list with one list of scores per window.
	'''
	num_windows = len(list(di.values())[-1]) #Number of windows of the last fasta record, should be the same for all
	window_scores = [[] for _ in range(num_windows)]
	for name in list_elements:
		if name not in di:
			print(name + ' was not found in the input aligned .fa file')
			sys.exit(__doc__)
		if len(di[name]) != num_windows:
			print(name + ' does not have the same number of windows as others')
			sys.exit(__doc__)
		for scores, score in zip(window_scores, di[name]):
			if score is not None:
				scores.append(score)
	return window_scores


def _find_significant_windows(window_scores, window_size, alpha):
	'''
	Test every window's scores against the per-window mean scores with a two-sample t-test.
	A window is reported when its p-value is below alpha divided by the number of windows
	(Bonferroni correction) and its t-statistic is positive (higher match percentage only).
	Returns one output row per significant window:
	[window name, start, stop (non-inclusive), mean score, number of elements, p-value], all as strings.
	'''
	num_windows = len(window_scores)
	#A window without any usable score has no mean (statistics.mean raises StatisticsError on empty data), so it is left out
	#NOTE(review): the module docstring describes the background as "all other windows", but the tested window's own mean is part of it here
	window_means = [statistics.mean(scores) for scores in window_scores if scores]
	rows = []
	for i, scores in enumerate(window_scores):
		if len(scores) < 2:
			continue #Sample variance is undefined with fewer than 2 scores, so such a window can never test as significant
		tstat, pval = stats.ttest_ind(scores, window_means)
		if pval < (alpha / num_windows) and tstat > 0:
			rows.append(['window' + str(i), str(i), str(i + window_size), str(statistics.mean(scores)), str(len(scores)), str(pval)])
	return rows


def _write_results(path, rows):
	'''Write the header line and one tab-separated line per significant window to path.'''
	with open(path, 'w') as o:
		o.write('window#\tconsensus_start\tconsensus_stop\tWindow_match%\tNum_elements\tp-value\n')
		for row in rows:
			o.write('\t'.join(row) + '\n')


def main(argv=None):
	'''
	Run the script.
	argv is [program, input aligned .fa file, element list file, sliding window size, output file];
	sys.argv is used when argv is None. Exits with the usage message on invalid arguments or input.
	'''
	if argv is None:
		argv = sys.argv
	if len(argv) != 5:
		sys.exit(__doc__)

	window_size = _parse_window_size(argv[3])
	list_elements = _read_element_names(argv[2])
	di = _read_window_scores(argv[1], window_size)
	if not di:
		print('No sequences were found in the input aligned .fa file')
		sys.exit(__doc__)

	window_scores = _collect_window_scores(di, list_elements)
	sig_rows = _find_significant_windows(window_scores, window_size, fdr)
	_write_results(argv[-1], sig_rows)


if __name__ == '__main__':
	main()

