# Changes As to Gs in an input FASTQ file and writes an index of the changed positions (per read) to stdout
# Written by Stephen Tran (@ Grace Xiao's lab, UCLA)

import sys
import re
from itertools import islice

# A FASTQ record is four consecutive lines: read name, sequence, separator, quality.
LINES_PER_RECORD = 4

# Number of input lines buffered per pass (10 million). Must remain a multiple
# of LINES_PER_RECORD so a record is never split across two chunks.
number_lines_to_read_in = 10000000

USAGE = (
    "\t--help : requires arguments <input_fastq_file> <output_fastq_A_to_G_file>"
    "\n\tinput_fastq_file : name of input fastq file"
    "\n\toutput_fastq_A_to_G_file : name of fastq file that will store per read positions of As converted to Gs\n"
)


def _open_or_exit(path, mode):
    """Open *path* with *mode*; on IOError report it on stderr and exit with status 1."""
    try:
        return open(path, mode)
    except IOError:
        sys.stderr.write("cannot open {0}\n".format(path))
        sys.exit(1)


def _convert_chunk(lines):
    """Convert every complete FASTQ record found in *lines*.

    Only whole groups of four lines are processed; any trailing partial group
    is left for the caller to deal with.

    Returns a ``(fastq_text, index_text)`` tuple:
      * fastq_text  - the records with every uppercase "A" in the sequence
                      line replaced by "G" (all other lines untouched);
      * index_text  - one line per record, ``<read name>\\t<positions>\\n``,
                      where positions are the comma-separated 0-based offsets
                      of the converted As (empty when the read has none).
    """
    fastq_parts = []
    index_parts = []
    usable = len(lines) - len(lines) % LINES_PER_RECORD
    for start in range(0, usable, LINES_PER_RECORD):
        header, sequence, separator, quality = lines[start:start + LINES_PER_RECORD]
        # Record the positions before converting, while the As are still visible.
        positions = [str(i) for i, base in enumerate(sequence) if base == "A"]
        fastq_parts.append(header + sequence.replace("A", "G") + separator + quality)
        index_parts.append(header.strip() + "\t" + ",".join(positions) + "\n")
    return "".join(fastq_parts), "".join(index_parts)


def main(argv=None):
    """Run the A-to-G conversion.

    *argv* defaults to ``sys.argv`` and must be
    ``[program, input_fastq_file, output_fastq_A_to_G_file]``.

    The converted FASTQ is written to the output file, the per-read index of
    converted positions to stdout, and progress/diagnostics to stderr.
    Exits with status 1 on a usage error, when a file cannot be opened, or
    when the input ends in the middle of a record.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        # write() accepts a single string, so the usage text is one literal.
        sys.stderr.write(USAGE)
        sys.exit(1)

    input_fastq_file = argv[1]
    output_fastq_A_to_G_file = argv[2]

    # The with-blocks guarantee both handles are closed even if we exit early.
    with _open_or_exit(input_fastq_file, 'r') as input_fastq_fh:
        with _open_or_exit(output_fastq_A_to_G_file, 'w') as output_fastq_fh:
            sys.stderr.write("reading in fastq file and converting As to Gs\n")
            counter = 0
            while True:
                chunk = list(islice(input_fastq_fh, number_lines_to_read_in))
                if not chunk:
                    break  # islice yields nothing once the file is exhausted
                # Report the number of lines actually read so far.
                counter += len(chunk)
                sys.stderr.write(str(counter) + '\n')

                fastq_text, index_text = _convert_chunk(chunk)
                output_fastq_fh.write(fastq_text)
                sys.stdout.write(index_text)

                # Chunks are a multiple of four lines, so a remainder can only
                # occur at end of file. Trailing blank lines are ignored;
                # anything else is a truncated record.
                leftover = len(chunk) % LINES_PER_RECORD
                if leftover and any(l.strip() for l in chunk[len(chunk) - leftover:]):
                    sys.stderr.write(
                        "input ended with an incomplete fastq record "
                        "({0} of {1} lines)\n".format(leftover, LINES_PER_RECORD))
                    sys.exit(1)

    sys.stderr.write("job completed converting fastq As to Gs\n")


if __name__ == "__main__":
    main()

