from gfpvar.tools.common import *
from leo.common import *
import scipy as SP
import glob
import os
import sys
import pdb

def read_pileup(filename, pileupfile, base_qual_cutoff=20, map_qual_cutoff=20):
    """Return the pileup cached at *filename*, building it if it is missing.

    filename         -- path checked for an existing cached result; also
                        handed to create_pileup as its output name.
    pileupfile       -- pileup text file parsed when no cache exists.
    base_qual_cutoff -- forwarded unchanged to create_pileup.
    map_qual_cutoff  -- forwarded unchanged to create_pileup.

    When *filename* exists, the value of cl(filename) is returned; `cl`
    comes from the project's star imports (presumably a cache loader --
    confirm against its definition).
    """
    if not os.path.exists(filename):
        # Nothing cached yet: parse the raw pileup and let create_pileup
        # store the result under *filename*.
        return create_pileup(pileupfile, filename, base_qual_cutoff, map_qual_cutoff)
    return cl(filename)


def combine_pileups(p1, p2):
    """Merge two pileup dictionaries over the union of their loci.

    Both inputs are dicts with the parallel, per-locus entries
      'L'      -- loci (each converted to a tuple for hashing/sorting),
      'refseq' -- reference entry for the locus,
      'D'      -- allele counts, one row of 5 values per locus,
      'Q'      -- five lists of qualities per locus (one per allele column).

    Returns a dict with the same four keys:
      'L'      -- sorted list of locus tuples present in either input,
      'D'      -- integer array of shape (len(L), 5) holding summed counts,
      'Q'      -- per locus, the five quality lists of p1 followed by p2,
      'refseq' -- reference entry per locus; when both inputs contain the
                  locus, the entry from p2 is the one kept.

    A one-line summary (input sizes and elapsed seconds) is printed.
    """
    import time
    # NumPy is used directly: the top-level scipy aliases (scipy.zeros etc.)
    # are deprecated in newer SciPy releases.
    import numpy as np

    started = time.time()
    pileups = (p1, p2)
    n_alleles = 5

    # Build locus -> row maps once so that each lookup below is O(1) instead
    # of a linear list.index() scan per locus.  setdefault keeps the first
    # row of a duplicated locus, which is what list.index() would return.
    row_maps = []
    for pileup in pileups:
        rows = {}
        for row, locus in enumerate(pileup['L']):
            rows.setdefault(tuple(locus), row)
        row_maps.append(rows)

    loci = sorted(set(row_maps[0]) | set(row_maps[1]))
    D = np.zeros((len(loci), n_alleles), dtype=int)
    Q, refseq = [], []

    for out_row, locus in enumerate(loci):
        quals = [[] for _ in range(n_alleles)]
        ref = None
        for pileup, rows in zip(pileups, row_maps):
            row = rows.get(locus)
            if row is None:
                continue  # this pileup has no data at the locus
            for allele in range(n_alleles):
                quals[allele].extend(pileup['Q'][row][allele])
            ref = pileup['refseq'][row]
            D[out_row] += pileup['D'][row]
        Q.append(quals)
        refseq.append(ref)

    # Same text as the former `print a, b, c` statement, but also valid
    # syntax on Python 3.
    print("%s %s %s" % (len(p1['L']), len(p2['L']), time.time() - started))
    return {'D': D, 'refseq': refseq, 'L': loci, 'Q': Q}
    

def create_pileup(pileupfile, outfilename, base_qual_cutoff=20, map_qual_cutoff=20):
    """Parse a pileup text file into per-locus allele counts and store the result.

    pileupfile       -- whitespace-separated pileup text file.  Column 1 is
                        the sequence name, column 2 the position, column 3
                        the reference base, column 9 the read bases and
                        column 10 the base qualities (this looks like the
                        10-column samtools consensus pileup layout --
                        confirm against the tool that produced the files).
    outfilename      -- passed to cdm() together with the result; `cdm`
                        comes from the project's star imports (presumably
                        it writes the cache file -- confirm).
    base_qual_cutoff -- a base is counted only if its quality
                        (ord(char) - 33) is strictly greater than this.
    map_qual_cutoff  -- accepted for interface compatibility; not used here.

    Returns None if *pileupfile* does not exist.  Otherwise returns a dict:
      'D'      -- array of allele counts per kept locus, columns A, C, G, T, *
      'Q'      -- per kept locus, five lists of the counted base qualities
      'L'      -- [sequence name, position] per kept locus
      'refseq' -- upper-cased reference base per kept locus
    Loci where no base passes the quality cutoff are left out.
    """
    # NumPy is used directly: the top-level scipy aliases (scipy.array etc.)
    # are deprecated in newer SciPy releases.
    import numpy as np

    if not os.path.exists(pileupfile):
        return None

    allele_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '*': 4}
    n_alleles = len(allele_index)
    D = []       # allele counts
    Q = []       # base qualities per allele
    L = []       # loci
    refseq = []  # reference base at the loci

    with open(pileupfile, 'r') as pfh:
        for line in pfh:
            d = line.strip().split()
            if not d:
                continue  # blank line
            if d[2] == "*":
                continue  # rows whose reference column is '*' are ignored

            ref = d[2].upper()
            counts = [0] * n_alleles
            quals = [[] for _ in range(n_alleles)]
            data, base_qual = d[8:10]

            i, j = 0, 0  # i walks the read-base string, j the quality string
            while i < len(data):
                symbol = data[i]
                if symbol in '+-':
                    # Indel: a decimal length follows, then that many
                    # characters of sequence, all of which are skipped.
                    # The bounds test must come before the index access.
                    length = 0
                    while i + 1 < len(data) and data[i + 1] in '0123456789':
                        length = length * 10 + int(data[i + 1])
                        i += 1
                    i += length
                elif symbol == '$':
                    pass  # marker only; carries no base and no quality
                elif symbol == '^':
                    # Marker followed by one extra character that is not a
                    # base (the mapping quality in samtools pileup output).
                    i += 1
                else:
                    # '.' and ',' stand for the reference base.
                    base = ref if symbol in ',.' else symbol.upper()
                    qual = ord(base_qual[j]) - 33
                    column = allele_index.get(base)
                    # Symbols outside A/C/G/T/* (e.g. N) are not counted,
                    # but they still consume their quality character.
                    if column is not None and qual > base_qual_cutoff:
                        counts[column] += 1
                        quals[column].append(qual)
                    j += 1
                i += 1

            # Keep the locus only if at least one base survived the cutoff.
            if sum(counts) > 0:
                L.append([d[0], int(d[1])])
                refseq.append(ref)
                D.append(counts)
                Q.append(quals)

    result = {'D': np.array(D), 'Q': Q, 'L': L, 'refseq': refseq}
    cdm(result, outfilename)
    return result


def main():
    """Submit one job per Sample4 lane-1 pileup, or combine pileups for one.

    Without command-line arguments: every file matching
    "<DATA_DIR>/seq/Sample4_*/2_pileup/*L001.pileup" is printed and a
    "submitjob python run_4_combine_pileup.py <file>" command is run for it.

    With one argument (a Sample4 "..._L001.pileup" path): for Sample3 and
    Sample4 in turn, the L001 and L002 pileups are read (or created),
    combined, and stored as "....combined_initial.pickle" next to the
    respective sample's pileup.  The two per-sample results are then
    combined and stored as "....combined.pickle" under the Sample4 path.
    """
    if len(sys.argv) == 1:
        pattern = "%s/seq/Sample4_*/2_pileup/*L001.pileup" % DATA_DIR
        for pileup_path in glob.glob(pattern):
            print(pileup_path)
            # NOTE(review): the path is interpolated into a shell command
            # unquoted; paths with spaces or shell metacharacters will break.
            os.system("submitjob python run_4_combine_pileup.py %s" % pileup_path)
        return

    f4 = sys.argv[1]
    f3 = f4.replace("Sample4", "Sample3")
    lane1_files = [f3, f4]
    # One intermediate output per sample, derived from that sample's own
    # input path.  Deriving the second name from the Sample3 name with
    # replace("Sample4", "Sample3") was a no-op and made both outputs share
    # one file, so the Sample4 result overwrote the Sample3 result.
    ofns = [path.replace("_L001.pileup", ".combined_initial.pickle")
            for path in lane1_files]

    for lane1, ofn in zip(lane1_files, ofns):
        lane2 = lane1.replace("L001", "L002")
        p1 = read_pileup(lane1 + ".pickle", lane1)
        p2 = read_pileup(lane2 + ".pickle", lane2)
        cdm(combine_pileups(p1, p2), ofn)

    # The intermediate files were just written, so read_pileup loads them
    # from disk rather than parsing anything.
    per_sample = [read_pileup(ofn, ofn) for ofn in ofns]
    cdm(combine_pileups(*per_sample),
        ofns[1].replace(".combined_initial.pickle", ".combined.pickle"))


# Run the driver only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
