'''
Calculates the probability that a motif is conserved given transition/transversion rates and a known ancestral sequence.
Requires the following arguments:
	Matching motifs file: File from FIMO that lists all significant matches of kmers of motif length
	Transition rate: Value between 0 and 1 for frequency of sites with transitional differences
	Transversion rate: Value between 0 and 1 for frequency of sites with transversional differences
	Ancestral seq : Sequence of the ancestral motif
	Output file: File to which the results are written
Usage: python3 calc_neutral_expectation_motif_conservation_knownAnc.py <Matching motifs file> <Transition rate> <Transversion rate> <Ancestral seq> <output file>
'''

import sys

# The script name plus exactly five positional arguments are required;
# anything else prints the usage text (module docstring) and exits.
if len(sys.argv) != 6:
	sys.exit(__doc__)

try:
	transition_rate = float(sys.argv[2])
	transversion_rate = float(sys.argv[3])
except ValueError:
	# float() raises ValueError (not TypeError) when given a non-numeric
	# string, so this is the exception that must be caught to show usage.
	sys.exit(__doc__)

# Upper-case the ancestral sequence so comparisons against the upper-cased
# matched sequences are case-insensitive.
anc_seq = sys.argv[4].upper()

# Each rate is documented as a value between 0 and 1 (this also rejects NaN).
if not (0 <= transition_rate <= 1 and 0 <= transversion_rate <= 1):
	sys.exit('Transition rate and transversion rate must each be between 0 and 1')
# Small tolerance so that rates summing to exactly 1 are not rejected because
# of floating-point rounding.
if transition_rate + transversion_rate > 1 + 1e-9:
	sys.exit('Transition rate and transversion rate must not sum to more than 1')

# Per-site probability that a site is unchanged relative to the ancestor.
conserved_rate = 1 - transition_rate - transversion_rate

def reverse_complement(seq):
	'''
	Return the reverse complement of a DNA sequence.
	Only the upper-case bases A, C, G and T are recognised; any other
	character raises KeyError.
	'''
	pairing = {'A':'T', 'T':'A', 'C':'G', 'G':'C'}
	# Walk the sequence back-to-front, swapping each base for its partner.
	return ''.join(pairing[nucleotide] for nucleotide in reversed(seq))

# Sum, over every matching kmer in the FIMO file, the probability of observing
# that kmer given the ancestral sequence and the per-site rates.

# Transitions keyed as (ancestral base, observed base): A<->G and C<->T.
# Every other mismatch is counted as a transversion.
_TRANSITIONS = {('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')}

total_probability = 0
num_motifs = 0
# Default so the report step still has a motif name when the file contains
# no data rows (otherwise the name would never be bound).
motif = ''
with open(sys.argv[1], 'r') as f:
	# The first line is treated as a column header and skipped.
	header = f.readline()
	for line in f:
		# Skip comment lines and blank lines.
		if line.startswith('#') or line.strip() == '':
			continue
		# Strip both \n and \r so files with Windows line endings do not
		# leave a stray carriage return in the last field.
		fields = line.rstrip('\r\n').split('\t')
		if len(fields) < 10:
			sys.exit(line + ' has fewer than 10 tab-separated fields')
		motif = fields[0]
		strand = fields[5]
		if strand == '+':
			seq = fields[9].upper()
		elif strand == '-':
			# Minus-strand matches are reverse-complemented before being
			# compared with the ancestral sequence.
			seq = reverse_complement(fields[9].upper())
		else:
			sys.exit(line + ' strand field is not + or -')
		# A site-by-site comparison is only meaningful for equal lengths;
		# fail clearly instead of raising IndexError or scoring a prefix.
		if len(seq) != len(anc_seq):
			sys.exit(line + ' matched sequence length does not equal ancestral sequence length (' + str(len(anc_seq)) + ')')
		num_transitions = 0
		num_transversions = 0
		num_conserved = 0
		for anc_base, base in zip(anc_seq, seq):
			if base == anc_base:
				num_conserved += 1
			elif (anc_base, base) in _TRANSITIONS:
				num_transitions += 1
			else:
				num_transversions += 1
		# Sites are treated as independent, so the kmer probability is the
		# product of the per-site rates.
		# NOTE(review): each base has two possible transversion targets; if
		# transversion_rate is the total frequency of transversional sites,
		# a specific target may need transversion_rate / 2 -- confirm.
		probability = conserved_rate**num_conserved * transition_rate**num_transitions * transversion_rate**num_transversions
		total_probability += probability
		num_motifs += 1

# Write the summary report to the path given as the last command-line argument.
with open(sys.argv[-1], 'w') as report:
	# print() supplies the final newline of each record; the first record
	# ends with an explicit '\n' so it is followed by a blank line.
	print('Neutral expectation for motif conservation\n\nProb(conserved): {}\n'.format(total_probability), file=report)
	print('Motif: {}'.format(motif), file=report)
	print('Ancestral sequence: {}'.format(anc_seq), file=report)
	print('Transition rate: {}'.format(transition_rate), file=report)
	print('Transversion rate: {}'.format(transversion_rate), file=report)
	# Record the exact invocation for reproducibility.
	print('Command: python3 {}'.format(' '.join(sys.argv)), file=report)
	

