# Combines the R and RC_R files into a single agglomerate file
# Written by Stephen Tran (@ Grace Xiao's lab, UCLA)


import sys

# The script takes exactly three positional arguments (argv[0] is the script
# itself): the R file, the RC file and the path of the combined output file.
if len(sys.argv) == 4:
	input_R_file, input_RC_file, output_agglomerate_file = sys.argv[1:]
else:
	# wrong argument count: print the usage text on stderr and stop with status 1
	sys.stderr.write(
		'--help : arguments are <input R file> and <input RC file> <output_agglomerate_file>'
		'\t\noutput_agglomerate_file : the combined R and RC file\n'
	)
	sys.exit(1)

def _open_or_exit(path, mode):
	"""Open path with the given mode and return the file object.

	If open() raises IOError, write "cannot open <path>" to stderr and exit
	with status 1.
	"""
	try:
		return open(path, mode)
	except IOError:
		# keep a space between the message and the path so the name is readable
		sys.stderr.write("cannot open " + path + '\n')
		sys.exit(1)


# Open both inputs for reading and the agglomerate output for writing; any
# file that cannot be opened aborts the script.
input_R_fh = _open_or_exit(input_R_file, 'r')
input_RC_fh = _open_or_exit(input_RC_file, 'r')
output_agglomerate_fh = _open_or_exit(output_agglomerate_file, 'w')

def get_all_same_reads(readname,fh,line):
	"""Collect the consecutive lines of fh that share one read name.

	readname -- the first tab-separated field of line
	fh       -- file-like object positioned just after line; it is advanced
	            with readline()
	line     -- the already-read line that starts the group ('' means the end
	            of the file was reached)

	Returns the list [lines, next_line, next_readname]:
	  lines         -- line followed by every subsequent line whose first
	                   tab-separated field equals readname
	  next_line     -- the first line that does not belong to the group, or ''
	                   at end of file
	  next_readname -- the first tab-separated field of next_line

	When line is '' nothing is read and [[], '', ''] is returned, so the result
	can always be unpacked.
	"""
	if line == '':
		return [[], '', '']
	lines = [line]
	while True:
		next_line = fh.readline()
		next_readname = next_line.split('\t')[0]
		# End of file is tested explicitly: at EOF the read name is '' and
		# would otherwise keep matching an empty readname forever.
		if next_line == '' or next_readname != readname:
			return [lines, next_line, next_readname]
		lines.append(next_line)

def _next_read_group(fh, first_line):
	"""Return (readname, lines, lookahead_line) for the group starting at first_line.

	first_line is a line already read from fh. lines holds every consecutive
	line with the same read name and lookahead_line is the first line after
	the group. An empty read name is this script's end-of-input marker
	(first_line is '' at end of file), in which case ('', [], '') is returned
	and fh is not read.
	"""
	readname = first_line.split('\t')[0]
	if readname == '':
		return '', [], ''
	lines, lookahead_line, _ = get_all_same_reads(readname, fh, first_line)
	return readname, lines, lookahead_line


def _copy_remaining(pending_lines, lookahead_line, fh):
	"""Write the buffered group, then the lookahead line and the rest of fh."""
	output_agglomerate_fh.writelines(pending_lines)
	while lookahead_line != '':
		output_agglomerate_fh.write(lookahead_line)
		lookahead_line = fh.readline()


sys.stderr.write( "starting job\n")

# For each input keep: the read name of the buffered group, the buffered lines
# themselves, and the lookahead line that was read past the end of the group.
R_readname, R_lines, R_line = _next_read_group(input_R_fh, input_R_fh.readline())
RC_readname, RC_lines, RC_line = _next_read_group(input_RC_fh, input_RC_fh.readline())

counter = 0
while R_readname != '' and RC_readname != '':
	counter += 1
	if counter % 10000000 == 0:
		sys.stderr.write( str(counter)+'\n')
	# Decide before refilling: the smaller read name is written first, and on
	# a tie both groups are written, R lines before RC lines.
	write_R = R_readname <= RC_readname
	write_RC = RC_readname <= R_readname
	if write_R:
		output_agglomerate_fh.writelines(R_lines)
	if write_RC:
		output_agglomerate_fh.writelines(RC_lines)
	# Only an input whose group was written is advanced; the other group stays
	# buffered for the next comparison.
	if write_R:
		R_readname, R_lines, R_line = _next_read_group(input_R_fh, R_line)
	if write_RC:
		RC_readname, RC_lines, RC_line = _next_read_group(input_RC_fh, RC_line)

# At most one input still has data. Its buffered group must be flushed before
# the rest of the file is copied, otherwise that group would be lost.
if R_readname != '':
	_copy_remaining(R_lines, R_line, input_R_fh)
elif RC_readname != '':
	_copy_remaining(RC_lines, RC_line, input_RC_fh)

input_R_fh.close()
input_RC_fh.close()
output_agglomerate_fh.close()

# Sanity check: re-read the agglomerate file and make sure the read names
# (first tab-separated field of each line) never decrease from one line to
# the next.
try:
	output_agglomerate_fh = open(output_agglomerate_file,'r')
except IOError:
	sys.stderr.write( "cannot open "+output_agglomerate_file+'\n')
	sys.exit(1)

# read name of the line preceding the one currently being examined
previous_readname = output_agglomerate_fh.readline().split('\t')[0]
for sam_line in output_agglomerate_fh:
	this_readname = sam_line.split('\t')[0]
	if previous_readname > this_readname:
		sys.stderr.write( "something wrong with the order of the sam file\n")
		sys.stderr.write( previous_readname+' should not lexicographically be before '+this_readname+'\n')
		sys.exit(1)
	previous_readname = this_readname

output_agglomerate_fh.close()

sys.stderr.write( 'job completed looking at '+input_R_file+' and '+input_RC_file+'\n')
