import scipy as SP
import pylab as PL
import os
import glob
import pdb
from parts2014_gfpvar.tools.common import *
from parts2014_gfpvar.tools.io import *


# Column indices into a raw array output line (and its header line) that are
# copied through when "full" condensed output is requested.
FULL_FEATURES = [
    23, 24, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
    41, 42, 43, 44, 45, 46, 51, 52, 57, 58, 59, 60,
    61, 62, 63, 64, 66, 67, 84, 85, 97, 98, 99, 100,
    101, 102,
]
CHR_NAMES = get_seq_chr_names()
# Chromosome labels; the position of a label in this list is the chromosome number.
ROMAN_CHR = "0 I II III IV V VI VII VIII IX X XI XII XIII XIV XV XVI mitochondrion".split(" ")
# Inverse lookup: chromosome number (as a string) -> key of CHR_NAMES whose value is that chromosome's label.
# (presumably the keys are NCBI sequence names - confirm against get_seq_chr_names)
NCBI_CHR_NAMES = {
    str(ROMAN_CHR.index(CHR_NAMES[seq_name])): seq_name
    for seq_name in CHR_NAMES
}


""" Read samples for given dataset
@param dataset
@require "%s/arrays/samples/%s.tab"%(DATA_DIR, dataset) to exist
@return (array ID, hyb) => [sample, sample nospaces, gfp low or high, ORF name, ORF name aliases]
"""
def read_samples(dataset="2013-02-22_round1"):
    """ Read the sample sheet of a dataset.

    @param dataset name of the dataset; the sheet is "%s/arrays/samples/%s.tab"%(DATA_DIR, dataset)
    @require the sample sheet to exist (opening it fails otherwise)
    @return map (array ID, hyb) => remaining tab-delimited columns of the line
            ([sample, sample nospaces, gfp low or high, ORF name, ORF name aliases])
    """
    res = {}
    # 'with' guarantees the handle is closed even if a line fails to parse
    with open("%s/arrays/samples/%s.tab"%(DATA_DIR, dataset), 'r') as ifh:
        for l in ifh:
            d = l.strip().split("\t")
            if len(d) < 2: continue # blank line or no (array ID, hyb) pair - nothing to key on
            res[(d[0], d[1])] = d[2:]
    return res # map of array ID, hyb => sample, sample nice, gfp low or high, ORF name, aliases


def get_sample_id(sample, dataset):
    """ Find the ID of a sample in the sample sheet of a dataset.

    @param sample value to look for in the fourth tab-delimited column of the sheet
    @param dataset name of the dataset; the sheet is "%s/arrays/samples/%s.tab"%(DATA_DIR, dataset)
    @require the sample sheet to exist
    @return "[first column]_[second column]" of the first matching line, or None if no line matches
    """
    with open("%s/arrays/samples/%s.tab"%(DATA_DIR, dataset), 'r') as ifh:
        for l in ifh:
            d = l.strip().split("\t")
            # lines with fewer than four columns cannot match; skip them instead of raising IndexError
            if len(d) > 3 and d[3] == sample: return "%s_%s"%(d[0], d[1])
    return None


""" Condense array file into a limited one.
@param infilename raw array output
@param outfilename tab-delimited output file name
@param full True if condensed file will have a lot of information (tens of columns on spot intensities etc). If False, output will have limited columns
@require exists(infilename); outfilename itself need not exist (it is created or overwritten), but its directory must exist
"""
def create_condensed_tab_file(infilename, outfilename, full=False):
    """ Condense one raw array output file into a sorted, tab-delimited probe table.

    Skips the first 9 lines (array metadata), reads the next line as the spot feature names,
    and writes one line per non-control probe: chromosome, location, probe type, strand,
    value and standard error of the value. Lines are first written to outfilename + ".tmp",
    then sorted by chromosome and location into outfilename; the temporary file is removed.

    @param infilename raw array output
    @param outfilename tab-delimited output file name
    @param full if True, the columns listed in FULL_FEATURES are appended to the header and to every line
    @require exists(infilename), the directory of outfilename exists, and the `sort` utility is on the PATH
    @raise RuntimeError if either `sort` invocation exits with a non-zero status
    """
    import subprocess # imported here because the module header does not import it

    LOG.debug("Condensing %s to %s"%(infilename, outfilename))
    tmpfilename = outfilename + ".tmp"
    with open(infilename, 'r') as ifh, open(tmpfilename, 'w') as ofh:
        for _ in range(9): next(ifh) # first 9 lines are array metadata, not spot info
        headers = next(ifh).rstrip("\r\n").split("\t") # spot features; newline removed so it cannot leak into the output header
        header = "Chromosome\tLocation\tProbetype\tStrand\tlog(BY/RM)\tse(log(BY/RM))"
        if full: header += "\t" + "\t".join([headers[i] for i in FULL_FEATURES])
        ofh.write(header + "\n")

        for l in ifh:
            d = l.strip().split("\t") # get spot data
            probename = d[11]
            if (probename in ("DarkCorner", "NegativeControl")) or (probename[0:2] in ("NC", "SM")): continue # skip probes that are controls or something else
            chrm, loc, parent = probename.split("_") # chr8_337367Bu_RM => chr8, 337367Bu, RM
            strand = loc[-1:] # u or d
            loc = int(loc[0:-2]) # '337367Bu' => 337367
            val = float(d[16]) # log(sample intensity) - log(BY/RM diploid hybrid intensity)
            val_err = float(d[17]) # se(val)
            fields = "%s\t%d\t%s\t%s\t%.4f\t%.4f"%(chrm[3:], loc, parent, strand, val, val_err)
            if full: fields += "\t" + "\t".join([d[i] for i in FULL_FEATURES])
            ofh.write(fields + "\n")

    # Same pipeline as `sort -nk2 tmp | sort -nsk1 > out`, but with argument lists instead of a
    # shell string, so file names containing spaces or shell metacharacters are handled correctly.
    try:
        with open(outfilename, 'w') as sorted_fh:
            by_loc = subprocess.Popen(["sort", "-nk2", tmpfilename], stdout=subprocess.PIPE)
            by_chrm = subprocess.Popen(["sort", "-nsk1"], stdin=by_loc.stdout, stdout=sorted_fh)
            by_loc.stdout.close() # the second sort owns the read end of the pipe now
            status_chrm = by_chrm.wait()
            status_loc = by_loc.wait()
    finally:
        os.remove(tmpfilename) # the temporary file is removed whether or not sorting succeeded
    if status_loc != 0 or status_chrm != 0:
        raise RuntimeError("sorting %s failed (sort exit statuses %d, %d)"%(outfilename, status_loc, status_chrm))


""" Convenience method to create a set of condensed tab files for many arrays
@param dataset dataset to be used
@param pattern string that sample files must include in order to be processed. If None, all files in the dataset directory will be processed
@param full - if True, full output is created for each file
"""
def create_condensed_tab_files(dataset="2013-02-22_round1", pattern=None, full=False):
    """ Condense every raw array file of a dataset with create_condensed_tab_file.

    Input files are "%s/arrays/raw/%s/txt/*.txt"%(DATA_DIR, dataset). Output goes to
    DATA_DIR/arrays/condensed/[dataset]/[sample].tab, or DATA_DIR/arrays/full/[dataset]/[sample].tab
    when full is True, where [sample] is the second metadata column of the sample sheet entry.

    @param dataset dataset to be used
    @param pattern string that the sample name must contain for its file to be processed. If None, all files are processed
    @param full if True, full output is created for each file
    @require every input file name to have the array ID as its second "_"-separated field and to
             start its last "_"-separated field with the hyb number, and that (array ID, hyb) pair
             to be present in the sample sheet (KeyError otherwise)
    """
    indir = "%s/arrays/raw/%s/txt"%(DATA_DIR, dataset)
    outdir = "%s/arrays/%s/%s"%(DATA_DIR, "full" if full else "condensed", dataset)
    samples = read_samples(dataset=dataset)

    # create the output directory without going through a shell, so unusual characters in the path are safe
    if not os.path.isdir(outdir): os.makedirs(outdir)
    files = glob.glob("%s/*.txt"%(indir))
    LOG.debug("Condensing files from %s - %d total"%(indir, len(files)))

    for f in files: # for each .txt file in dataset input directory
        fields = os.path.basename(f).split("_")
        array, hyb = fields[1], fields[-1][0:1] # get the array ID and hyb number from the filename
        sample = samples[(array, hyb)][1] # match these numbers to a sample
        if (pattern is not None) and (pattern not in sample): continue # don't process samples that don't match the pattern (if one given)
        create_condensed_tab_file(f, "%s/%s.tab"%(outdir, sample), full) # condense the output



""" Read a condensed array file, and return the relevant features (locations of probes, log-ratios, standard errors of ratios)
@param sample name of sample to load
@param dataset name of dataset to load sample from
@param full whether to read full array file
@return (locations of probes, log-ratios, standard errors of ratios)
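All return values are maps {chrm:SP.array(vals)}. As implemented, each log-ratio is the non-BY probe value minus the BY probe value at a location, averaged over strands (NOTE: this line previously said "BY signal - RM signal"; confirm the intended sign against the code); each standard error is the sum of the non-BY and BY standard errors, averaged over strands.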
"""
def read_array_file(sample, dataset="2013-02-22_round1", full=False):
    """ Read a condensed array file and summarise its probes per location.

    @param sample name of sample to load
    @param dataset name of dataset to load sample from
    @param full if True, the call is delegated to read_full_array_file(sample, dataset) and the
                condensed file is not opened at all
    @return (locs, res, err), each a map {chrm: SP.array} with one entry per distinct location
            (sorted ascending). For every location, res is the value of the non-BY probe minus the
            value of the BY probe, averaged over the two strands while ignoring missing probes;
            err is the sum of the two standard errors, averaged the same way.
            NOTE(review): older documentation of this function states "BY signal - RM signal";
            the indexing below yields the opposite sign - confirm which one is intended.
    """
    if full: return read_full_array_file(sample, dataset) # decide before opening, so no handle is leaked

    vals = {}
    with open("%s/arrays/condensed/%s/%s.tab"%(DATA_DIR, dataset, sample), 'r') as ifh:
        next(ifh, None) # skip the header line
        for l in ifh:
            d = l.strip().split()
            if not d: continue # blank line
            chrm, loc, parent, strand, val, val_err = d[0:6]
            loc = int(loc)
            chrm_vals = vals.setdefault(chrm, {})
            # per location: axis 0 = (value, standard error), axis 1 = (non-BY, BY), axis 2 = (non-u, u); NaN = probe missing
            if loc not in chrm_vals: chrm_vals[loc] = SP.zeros([2,2,2])*float("nan")
            # explicit ints: indexing with Python bools is not treated as 0/1 by current NumPy
            p, s = int(parent == "BY"), int(strand == "u")
            chrm_vals[loc][0, p, s] = float(val)
            chrm_vals[loc][1, p, s] = float(val_err)

    locs, res, err = {}, {}, {}
    for chrm in vals:
        locs[chrm] = SP.array(sorted(vals[chrm].keys()))
        res[chrm] = SP.zeros(len(locs[chrm]))
        err[chrm] = SP.zeros(len(locs[chrm]))

        for i,loc in enumerate(locs[chrm]):
            probes = vals[chrm][loc]
            res[chrm][i] = SP.nanmean(probes[0,0] - probes[0,1])
            err[chrm][i] = SP.nanmean(probes[1,0] + probes[1,1])
    return locs, res, err



""" Read allele frequencies for sample from given dataset
@param sample name of sample
@param dataset dataset used
@return map chrm->([locs], [afs]) . E.g. "A8"->([121,124,155,...], [0.44, 0.46, 0.55, ...])
"""
def read_sample_af(sample, dataset):
    """ Read the per-chromosome files of a sample from the inferred_af directory of a dataset.

    Reads every file matching "%s/arrays/inferred_af/%s/%s/chrm_*.tab"%(DATA_DIR, dataset, sample).
    Lines starting with "#" and blank lines are skipped; from every other line the second
    whitespace-separated column is read as an integer location and the third as a float.

    @param sample name of sample
    @param dataset dataset used
    @return map chrm->([locs], [values]), where chrm is the part of the file name between "chrm_"
            and ".tab". E.g. "3"->([121,124,155,...], [0.44, 0.46, 0.55, ...])
            NOTE(review): the values are described as allele frequencies, but output_signal_file
            writes the raw probe signal into the third column of files with this name - confirm
            which writer produces the files read here.
    """
    afs = {}
    for f in glob.glob("%s/arrays/inferred_af/%s/%s/chrm_*.tab"%(DATA_DIR, dataset, sample)):
        # take the name from the file itself, so underscores elsewhere in the path or in the name cannot truncate it
        chrm = os.path.basename(f)[len("chrm_"):-len(".tab")]
        locs, values = [], []
        with open(f, 'r') as ifh:
            for l in ifh:
                if l.startswith("#"): continue
                d = l.strip().split()
                if len(d) < 3: continue # blank or truncated line
                locs.append(int(d[1]))
                values.append(float(d[2]))
        afs[chrm] = locs, values
    return afs



""" Output the inferred mean and variance in probe intensities
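@param dataset dataset for the samples
@param samples list of samples the signals are for
@param chrm chromosome of the signals
@param locs locations of the probes
@param raw raw probe signals, indexed as raw[sample index][probe index]
@param raw_var variances of the raw probe signals, indexed like raw; their square roots are written as standard deviations
@param signals list of signals corresponding to samples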
"""
def output_signal_file(dataset, samples, chrm, locs, raw, raw_var, signals):
    """ Write raw and inferred probe signals of one chromosome, one file per sample.

    For every sample the file DATA_DIR/arrays/inferred_af/[dataset]/[sample]/chrm_[chrm].tab is
    (over)written with a "#" header line and one line per probe location.

    @param dataset dataset for the samples
    @param samples list of samples the signals are for
    @param chrm chromosome of the signals; int(chrm) is used as index into ROMAN_CHR
    @param locs locations of the probes
    @param raw raw probe signals, indexed as raw[sample index][probe index]
    @param raw_var variances of the raw probe signals, indexed like raw (square roots are written)
    @param signals per sample a sequence whose item 0 is indexed by probe (inferred signal), item 1
                   is a single value whose square root is written on every line, and item 2 is
                   indexed by probe and then by 2 (written as p(probe_is_foreground))
                   NOTE(review): item 1 is not indexed by probe, unlike the other columns - confirm
                   it is meant to be one variance per sample.
    """
    for s, sample in enumerate(samples):
        outdir = "%s/arrays/inferred_af/%s/%s"%(DATA_DIR, dataset, sample)
        # create the directory without a shell, so unusual characters in sample names are safe
        if not os.path.isdir(outdir): os.makedirs(outdir)
        with open("%s/chrm_%s.tab"%(outdir, chrm), 'w') as ofh:
            ofh.write("#Chrm\tloc\traw probe signal\tSD(raw probe signal)\tinferred(probe_signal)\tSD(inferred probe_signal)\tp(probe_is_foreground)\n")
            for l, loc in enumerate(locs):
                ofh.write("%s\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.2f\n"%(ROMAN_CHR[int(chrm)], loc, raw[s][l], raw_var[s][l]**0.5, signals[s][0][l], signals[s][1]**0.5, signals[s][2][l][2]))


def read_array_data_for_qtlcall(sample_id):
    """ Load the stored array data of one sample.

    @param sample_id ID of the sample; selects the file DATA_DIR/arrays/inferred_af/pickles/[sample_id].pickle
    @return whatever cl() returns for that file (presumably the unpickled object - confirm against cl;
            if so, only load files from trusted sources)
    """
    pickle_path = "%s/arrays/inferred_af/pickles/%s.pickle"%(DATA_DIR, sample_id)
    return cl(pickle_path)


""" Write observed allele and inferred allele frequency data for a sample into a file
@param sample sample to write
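@effects creates file arrays/txt_output/[ORF]_[gfp population with "_" replaced by "-"]_[last two characters of rep]_af.tab with one line per locus: chromosome name, location, f_ML, f_est, the spread of f_est (the column is headed var(f_est) but holds the value the code computes as sd_post), and the bad-locus flag. Nothing is written for samples whose comment is "discard" or whose discovery set starts with "none" """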
def write_sample_array_data(sample):
    """ Write the per-locus allele frequency estimates of a sample to a tab-delimited file.

    The data come from read_array_data_for_qtlcall(sample) and the metadata from
    get_all_metadata(sample_set="seq")[sample]. Samples whose comment is "discard" or whose
    discovery set starts with "none" are skipped without writing anything.

    @param sample sample to write
    @effects creates DATA_DIR/arrays/txt_output/[ORF]_[gfp population, "_" replaced by "-"]_[last two characters of rep]_af.tab
             with a "#" header line and one line per locus: chromosome name, location, f_ML,
             f_est, sd_post, bad-locus flag
    """
    data = read_array_data_for_qtlcall(sample)
    chrnames = get_seq_chr_names()
    (samplename, rep, gfppop, orf, assay, diagnosis, comment, discovery_set) = get_all_metadata(sample_set="seq")[sample]
    if comment == "discard" or discovery_set[0:4] == "none":
        LOG.debug("Skipping sample %s (comment=%s, discovery set=%s)"%(sample, comment, discovery_set))
        return
    LOG.debug("Outputting data for %s"%sample)
    with open("%s/arrays/txt_output/%s_%s_%s_af.tab"%(DATA_DIR, orf, gfppop.replace("_","-"), rep[-2:]), 'w') as ofh:
        # NOTE(review): the fifth column is headed var(f_est), but sd_post below has the form of a
        # standard deviation - confirm which of the header or the value is intended.
        ofh.write("#Chrm\tLoc\tf_ML\tf_est\tvar(f_est)\tbad_locus?\n")
        for chrm in sorted(data):
            # coverage is unpacked to check the record shape but is not written
            ml_m, post_m, post_params, bad_loc, loci, coverage = data[chrm]
            # sqrt(prod)/sum/sqrt(sum+1) per row; this is the SD of a Beta distribution if each row
            # of post_params holds its two parameters (assumption - confirm against the producer)
            param_sum = post_params.sum(axis=1)
            sd_post = (post_params.prod(axis=1)**0.5)/param_sum/((param_sum + 1)**0.5)

            for i, locus in enumerate(loci):
                ofh.write("%s\t%d\t%.3f\t%.3f\t%.3f\t%s\n"%(chrnames[chrm], locus, ml_m[i], post_m[i], sd_post[i], str(bad_loc[i])))


""" Get metadata for all samples from file seq/samples/meta.tab
@param sample_set set of samples to consider (from arrays, sequencing, or QTL mapping (that includes both))
@return map sample->[metadata]
"""
def get_all_metadata(sample_set="qtl"):
    """ Get metadata for all samples of a sample set.

    @param sample_set name of the sample set; the metadata file is DATA_DIR/[sample_set]/samples/meta.tab
    @require the metadata file to exist
    @return map sample->[metadata], i.e. first tab-delimited column -> list of the remaining columns;
            blank lines are ignored
    """
    result = {}
    with open('%s/%s/samples/meta.tab'%(DATA_DIR, sample_set),'r') as ifh:
        for l in ifh:
            d = l.strip().split("\t")
            if d[0] == "": continue # blank line; would otherwise create an entry under the empty key
            result[d[0]] = d[1:]
    return result
