#!/usr/bin/env python
import numpy as np
from sys import argv
from collections import OrderedDict

# Usage:
#   <script> <libsize> <nbins> <upstream_flanks.txt> <downstream_flanks.txt> <out.npy>
#
# Example arguments (from an earlier development run):
#   libsize    = 13541232
#   nbins      = 100
#   upstream   = "BCL3.t2_rep3.in.protein_coding.filtered_by_expression.all_upstream_flanks.txt"
#   downstream = "BCL3.t2_rep3.in.protein_coding.filtered_by_expression.all_downstream_flanks.txt"
#   out        = "BCL3.t2_rep3.in.protein_coding.filtered_by_expression.1000bp_bins_to_100000bp.npy"


def _read_cumulative_counts(path):
    """Read a flank-bin count table and group the counts by gene.

    The first two lines of the file are skipped as headers. Every remaining
    line must hold exactly seven tab-separated fields: the first is
    ``<gene>_<flankbin>`` and the last is an integer count.

    Returns an OrderedDict mapping each gene to its list of counts, with both
    the genes and the counts kept in file order.
    """
    counts_by_gene = OrderedDict()
    with open(path, "r") as f:
        # Skip the two header lines.
        next(f)
        next(f)
        for line in f:
            gene_flankbin, _, _, _, _, _, count = line.strip().split("\t")
            gene, _flankbin = gene_flankbin.split("_")
            counts_by_gene.setdefault(gene, []).append(int(count))
    return counts_by_gene


def _per_bin_counts(cumulative, nbins):
    """Turn nested (cumulative) bin counts into exact per-bin counts.

    ``cumulative`` must be non-empty, hold at most ``nbins`` values, and be
    ordered so that each bin also contains all counts of the bin that follows
    it (nested bins); the last entry is the innermost bin.

    Genes with fewer than ``nbins`` bins (e.g. near a chromosome end) are
    padded at the front with copies of their outermost count so that every
    gene lines up on the same bin grid. Because a padded bin equals the bin
    after it, the padded positions come out as 0 after the subtraction.

    Returns a float array of length ``nbins``.
    """
    missing = nbins - len(cumulative)
    padded = [cumulative[0]] * missing + list(cumulative)

    # Subtract the next (smaller, nested) bin from each bin to keep only the
    # counts that fall in that exact bin; the last bin has nothing nested
    # inside it and is kept as is.
    per_bin = np.asarray(padded, dtype=float)
    per_bin[:-1] = per_bin[:-1] - per_bin[1:]
    return per_bin


def _flank_matrix(path, nbins, reverse_bins):
    """Build the (genes x nbins) matrix of per-bin counts for one flank file.

    ``reverse_bins`` reverses each gene's counts before padding and
    subtraction; it is used for the downstream file, whose bins are processed
    in the opposite order to the upstream file.

    Raises ValueError if a gene has more than ``nbins`` bins.
    """
    counts_by_gene = _read_cumulative_counts(path)
    matrix = np.zeros((len(counts_by_gene), nbins))
    for row, (gene, counts) in enumerate(counts_by_gene.items()):
        if len(counts) > nbins:
            raise ValueError(
                "gene {0!r} in {1} has {2} bins, expected at most {3}".format(
                    gene, path, len(counts), nbins
                )
            )
        if reverse_bins:
            counts = counts[::-1]
        matrix[row, :] = _per_bin_counts(counts, nbins)
    return matrix


def main(args):
    """Compute per-bin CPM around genes and save the matrix with ``np.save``.

    ``args`` follows the ``sys.argv`` layout:
    ``[program, libsize, nbins, upstream_file, downstream_file, out_file]``.

    The saved array has one row per gene and ``2 * nbins`` columns: the
    upstream per-bin counts followed by the left-right flipped downstream
    per-bin counts, all divided by ``libsize / 10**6``.

    NOTE(review): rows are matched purely by position, so both files are
    assumed to list the same genes in the same order -- confirm upstream of
    this script.
    """
    libsize = float(args[1])
    nbins = int(args[2])
    upstream = args[3]
    downstream = args[4]
    out = args[5]

    upstream_counts = _flank_matrix(upstream, nbins, reverse_bins=False)
    downstream_counts = _flank_matrix(downstream, nbins, reverse_bins=True)

    # Downstream rows were built in reversed bin order; flip them back so the
    # two halves of each row mirror each other around the gene.
    all_counts = np.hstack([upstream_counts, np.fliplr(downstream_counts)])
    cpm = all_counts / (libsize / 10**6)
    np.save(out, cpm)


if __name__ == "__main__":
    main(argv)