'''Takes the combined CRE-seq read counts for multiple samples (the output of the merge_TECRE_sample_BC_counts.py script) and normalizes them to counts per million (CPM).
Removes any BC with fewer than 5 counts in any replicate.
Outputs a tab-delimited file (normalized_cpm_CRE-seq_file) with the BC, the CRE, and the normalized counts for each sample.
Usage: python norm_cpm_CRE-seq.py <input file: TECRE_BC_counts_merged> <output file>'''

import sys

# Barcodes with fewer than this many reads in any sample are dropped.
MIN_COUNT = 5
# Leading identifier columns (BC, CRE) that are copied through unchanged.
ID_COLUMNS = 2


def parse_counts(lines, min_count=MIN_COUNT):
    """Parse a merged count table and filter low-count barcodes.

    ``lines`` is any iterable of text lines (an open file works); the first
    line is the tab-delimited header, and every following line holds the
    identifier columns followed by one integer count per sample.

    Returns ``(header, rows, totals)`` where ``header`` is the list of column
    names, ``rows`` is a list of ``(id_fields, counts)`` pairs for the
    barcodes that passed the filter, and ``totals`` is the per-sample sum of
    counts.

    Raises ``ValueError`` if a row's column count differs from the header's
    or if a count is not an integer.
    """
    line_iter = iter(lines)
    header = next(line_iter, '').rstrip('\r\n').split('\t')
    totals = [0] * max(len(header) - ID_COLUMNS, 0)
    kept = []

    for line_number, line in enumerate(line_iter, 2):
        stripped = line.rstrip('\r\n')
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no data; storing them would misalign the output columns.
        if not stripped.strip():
            continue

        fields = stripped.split('\t')
        if len(fields) != len(header):
            raise ValueError(
                'line %d: expected %d tab-separated fields, found %d'
                % (line_number, len(header), len(fields)))

        try:
            counts = [int(value) for value in fields[ID_COLUMNS:]]
        except ValueError:
            raise ValueError(
                'line %d: non-integer count in %r' % (line_number, stripped))

        # Totals are accumulated before filtering, so the per-sample sums
        # used for normalization include the barcodes that get dropped.
        for index, count in enumerate(counts):
            totals[index] += count

        if all(count >= min_count for count in counts):
            kept.append((fields[:ID_COLUMNS], counts))

    return header, kept, totals


def counts_to_cpm(rows, totals):
    """Scale each count to counts per million of its sample's total.

    ``rows`` and ``totals`` are as returned by ``parse_counts``. Returns a
    list of ``(id_fields, cpm_values)`` pairs in the same order as ``rows``.
    """
    scales = [float(total) / 1000000 for total in totals]
    # A zero scale is only reachable when min_count <= 0; emit 0.0 rather
    # than dividing by zero.
    return [
        (ids, [count / scale if scale else 0.0
               for count, scale in zip(counts, scales)])
        for ids, counts in rows
    ]


def format_table(header, cpm_rows):
    """Return the output file's lines: the header, then one line per row."""
    lines = ['\t'.join(header) + '\n']
    for ids, values in cpm_rows:
        lines.append('\t'.join(ids + [str(value) for value in values]) + '\n')
    return lines


def normalize_cpm_file(count_file, output_file, min_count=MIN_COUNT):
    """Read ``count_file``, normalize to CPM and write ``output_file``."""
    with open(count_file, 'r') as source:
        header, rows, totals = parse_counts(source, min_count)

    # The output is opened only after the input parsed cleanly, so a bad
    # input file never truncates an existing output file.
    with open(output_file, 'w') as sink:
        sink.writelines(format_table(header, counts_to_cpm(rows, totals)))


def main(argv):
    """Command-line entry point; ``argv`` has the same layout as sys.argv."""
    if len(argv) != 3:
        sys.exit(__doc__)
    normalize_cpm_file(argv[1], argv[2])


if __name__ == '__main__':
    main(sys.argv)

