# Converts SAM records that were mapped to the reverse-complement genome back to their coordinates on the normal genome. Also flips the strand of each record from + to -.

# Written by Stephen Tran (@ Grace Xiao's lab, UCLA)


import sys
import re
from itertools import islice

# Exactly one command-line argument (the .fai index path) is required;
# anything else prints the usage text to stderr and aborts with status 1.
argument_count = len(sys.argv) - 1
if argument_count != 1:
	usage_text = "--help : requires arguments <fai_file>\nfai_file : fai genome file\n"
	sys.stderr.write(usage_text)
	sys.exit(1)

fai_file = sys.argv[1]

#functions
# Complement of every symbol ReverseComplement1 accepts. Built once at import
# time instead of on every call. Covers the plain bases, the IUPAC ambiguity
# codes, and the placeholder symbols '.', '=' and '*', which are their own
# complement.
_COMPLEMENT_OF_BASE = {
	'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
	'R': 'Y', 'Y': 'R', 'K': 'M', 'M': 'K',
	'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
	'S': 'S', 'W': 'W', 'N': 'N',
	'.': '.', '=': '=', '*': '*',
}
# Soft-masked (lower-case) bases complement to lower-case bases.
_COMPLEMENT_OF_BASE.update(
	dict((base.lower(), comp.lower()) for base, comp in list(_COMPLEMENT_OF_BASE.items()))
)

def ReverseComplement1(seq):
	"""Return the reverse complement of a nucleotide sequence.

	seq : string of base symbols. Upper- and lower-case bases, IUPAC
	      ambiguity codes and the placeholders '.', '=' and '*' are accepted;
	      case is preserved.

	Returns the complemented sequence read in the opposite direction.
	Raises KeyError for any symbol that is not in the complement table.
	"""
	return "".join([_COMPLEMENT_OF_BASE[base] for base in reversed(seq)])

def reverse_sam_flag(sam_flag):
	"""Return a CIGAR string with its operations in reverse order.

	Despite the function and parameter names, the value handled here is a
	CIGAR string (for example ``10M5N20M`` becomes ``20M5N10M``), not the
	numeric SAM FLAG.

	sam_flag : CIGAR string to flip.

	Writes a message to stderr and exits with status 1 when the string
	contains no digits or no non-digit characters.
	"""
	counts = re.findall(r'(\d+)', sam_flag)
	if len(counts) == 0:
		sys.stderr.write('cannot parse CIGAR string for numbers ' + sam_flag + '\n')
		sys.exit(1)
	operations = re.findall(r'(\D+)', sam_flag)
	if len(operations) == 0:
		sys.stderr.write('cannot parse CIGAR string for letters ' + sam_flag + '\n')
		sys.exit(1)
	# Reverse both lists, then re-pair count and operation by position.
	counts.reverse()
	operations.reverse()
	pieces = [counts[k] + operations[k] for k in range(len(counts))]
	return ''.join(pieces)

def flip_quality_scores(quality_scores):
	"""Return the quality values in the opposite order (last value first)."""
	back_to_front = slice(None, None, -1)
	return quality_scores[back_to_front]

#change old position on reverse complement genome to new position on the normal genome
def get_new_position(old_position,cigar_string,len_chr):
	"""Translate a start position on the reverse-complement genome to the normal genome.

	old_position : 1-based start of the alignment on the reverse-complement
	               chromosome (int or numeric string).
	cigar_string : CIGAR string of the alignment.
	len_chr      : chromosome length (int or numeric string).

	The alignment's span on the reference is the sum of the M, X, =, D and N
	operation lengths; I, S, H and P add nothing. The last aligned base on
	the reverse-complement chromosome becomes the first aligned base on the
	normal chromosome, so the result is ``len_chr - last_aligned + 1``.
	Example: a 50M read starting at position 1 on a 70-base chromosome maps
	to position 21.

	Original author's note: handling of I, D and S was not relied upon,
	because the downstream editing-calling pipeline does not look at reads
	containing those operations.

	Writes a message to stderr and exits with status 1 when the CIGAR string
	has no digits, has no operation letters, or contains an unknown operation.
	"""
	start_on_rc = int(old_position)
	counts = re.findall(r'(\d+)', cigar_string)
	if not counts:
		sys.stderr.write('cannot parse CIGAR string for numbers ' + cigar_string + '\n')
		sys.exit(1)
	operations = re.findall(r'(\D+)', cigar_string)
	if not operations:
		sys.stderr.write('cannot parse CIGAR string for letters ' + cigar_string + '\n')
		sys.exit(1)
	# Operations that move along the reference versus those that do not.
	advances_reference = ('M', 'X', '=', 'D', 'N')
	stays_on_reference = ('I', 'S', 'H', 'P')
	reference_span = 0
	for idx, operation in enumerate(operations):
		length = int(counts[idx])
		if operation in advances_reference:
			reference_span += length
		elif operation not in stays_on_reference:
			sys.stderr.write('cannot figure out CIGAR letter ' + cigar_string + '\n')
			sys.exit(1)
	# Position of the last base aligned on the reverse-complement chromosome.
	last_aligned = start_on_rc + reference_span - 1
	return int(len_chr) - last_aligned + 1


try:
	fai_fh = open(fai_file,'r')
except IOError:
	sys.stderr.write( 'cannot open '+fai_file+'\n')
	sys.exit(1)

#first get the lengths of each chromosome
#only the first two tab-separated columns of the .fai file are used: name and length
chromosome_lengths = dict() #chromosome_lengths->[chr]->length (kept as the string read from the file)
with fai_fh: #close the index as soon as it has been read; fai_fh stays bound, so a later fai_fh.close() is a harmless no-op
	for line in fai_fh:
		line_info = line.rstrip('\r\n').split('\t')
		if line_info == ['']:
			continue #tolerate blank lines (e.g. a trailing empty line)
		if len(line_info) < 2 or not line_info[1].strip().isdigit():
			#fail here with a clear message instead of a traceback later on
			sys.stderr.write( 'cannot parse chromosome length from '+fai_file+'\n'+line)
			sys.exit(1)
		chromosome_lengths[line_info[0]] = line_info[1]

#Stream SAM records from stdin, convert each one to normal-genome coordinates and write it to stdout.
#Only FLAG, POS, CIGAR, SEQ and QUAL (columns 2, 4, 6, 10 and 11) are rewritten; every other column is passed through untouched.
#NOTE(review): header lines (starting with '@') are not handled here; the input is presumably header-less - confirm.
counter= 0 #keep track of progress
number_lines_to_read_in = 1000000 #lets read in and process 1 million lines at a time
REVERSE_STRAND_BIT = 0x10 #FLAG bit that marks a read as reverse-complemented
MANDATORY_SAM_FIELDS = 11 #columns that must be present before the optional tags
while True:
	next_n_lines = list(islice(sys.stdin,number_lines_to_read_in))
	if not next_n_lines:
		break #this is how I'll know when EOF
	sam_list = list() #to hold the converted records of this batch
	for line in next_n_lines:
		counter +=1
		if counter % 1000000 == 0:
			sys.stderr.write( str(counter)+'\n')
		#split off the line terminator first: when a record has no optional tags the
		#terminator would otherwise be part of QUAL and be moved to the front when QUAL is reversed
		record = line.rstrip('\r\n')
		line_ending = line[len(record):]
		line_info = record.split('\t')
		if len(line_info) < MANDATORY_SAM_FIELDS:
			sys.stderr.write( 'cannot parse sam record, fewer than 11 fields\n'+line)
			sys.exit(1)
		#first fix the strand (convert all + to - strand)
		flag = int(line_info[1])
		if flag & REVERSE_STRAND_BIT: #minus strand
			sys.stderr.write( 'please perform remove_minus_strand_mappings.py step\n')
			sys.stderr.write( 'a read is from minus strand\n')
			sys.stderr.write( line+'\n')
			sys.exit(1)
		line_info[1] = str(flag | REVERSE_STRAND_BIT)
		#next fix the position of the read
		chromosome = line_info[2]
		if chromosome not in chromosome_lengths:
			sys.stderr.write( 'cannot find chromosome '+chromosome+' in '+fai_file+'\n'+line)
			sys.exit(1)
		length_chr = chromosome_lengths[chromosome]
		old_CIGAR_string = line_info[5]
		line_info[3] = str(get_new_position(line_info[3],old_CIGAR_string,length_chr))
		#fix the CIGAR string (flip it over)
		line_info[5] = reverse_sam_flag(old_CIGAR_string)
		#get reverse complement of the sequence
		line_info[9] = ReverseComplement1(line_info[9])
		#flip over the quality scores
		line_info[10] = flip_quality_scores(line_info[10])
		#store the fixed read with its original line terminator
		sam_list.append("\t".join(line_info)+line_ending)
	sys.stdout.write("".join(sam_list))

fai_fh.close()
sys.stderr.write( 'job completed fixing reverse complement sam file\n')
