import sys
import struct
import os
import glob
import scipy as SP
import scipy.stats as ST
import pdb
from common import *


def get_array_data(skip_nan=True, debug_skipped=False):
    """Load the plate/ORF/GFP reference table into a plate x row x column grid.

    Reads ``<DATA_DIR>/ref/plate_orfs_gfp.tab`` (tab separated). For every line
    whose column 5 is one of the strings "0".."17", the tuple
    (column 0, column 8, column 9, column 10) -- SGD name, common name, GFP
    level, localisation -- is stored at ``data[plate, row, col]``, where plate,
    row and col are the integers in columns 5, 6 and 7.

    Args:
        skip_nan: if true, lines whose GFP level (column 9) is 'low signal',
            'not visualized' or 'technical problem' are not stored.
        debug_skipped: if true, print every line skipped for that reason.

    Returns:
        Object array of shape (17, 17, 25, 4); positions never written hold NaN.
    """
    d = SP.loadtxt("%s/ref/plate_orfs_gfp.tab"%DATA_DIR, dtype=object, delimiter="\t")
    data = SP.zeros([17,17,25,4], dtype=object)*SP.nan
    # A real list (not a lazy map object) so repeated membership tests work on
    # every Python version.
    # NOTE(review): "17" is accepted here but the first axis of `data` only has
    # indices 0..16, so a plate-17 line would raise IndexError -- confirm range.
    platestrs = [str(p) for p in range(18)]
    nans = ['low signal', 'not visualized', 'technical problem']
    for entry in d:
        if entry[5] not in platestrs:
            print("not in plates %s" % (entry,))
            continue
        if entry[9] in nans and skip_nan:
            if debug_skipped:
                print("skipping %s" % (entry,))
            continue
        plate, row, col = int(entry[5]), int(entry[6]), int(entry[7])
        # SGD name, common name, gfp level, localisation
        data[plate, row, col] = (entry[0], entry[8], entry[9], entry[10])
    return data



def get_array_gene_gfp(skip_nan=True, debug_skipped=False):
    """Return a dict mapping gene id to its GFP level as a float.

    The grid produced by ``get_array_data(skip_nan, debug_skipped)`` is walked
    in plate, row, column order. Every position whose third entry (GFP level)
    does not stringify to "nan" contributes ``entry[0] -> float(entry[2])``;
    a gene id seen more than once keeps the value from the last position.
    ``float()`` raises ValueError if a stored GFP level is not numeric.
    """
    grid = get_array_data(skip_nan, debug_skipped)
    gene_gfp = {}
    # Collapsing the three leading axes keeps the original C-order traversal.
    n_positions = grid.shape[0] * grid.shape[1] * grid.shape[2]
    for entry in grid.reshape(n_positions, -1):
        level = entry[2]
        if str(level) == "nan":
            continue
        gene_gfp[entry[0]] = float(level)
    return gene_gfp


#YJL092W	GFP(+)02	A3	A	3	1	1	6	HPR5	5190	nucleus
class cGene:
    """One gene record parsed from a tab-separated plate annotation line.

    Columns read from the line (0-based):
        0  gene id                    -> ``id``
        1  96-well plate              -> ``plate96``
        2  96-well coordinate         -> ``coord96``
        5  384-well plate             -> ``plate384``
        6, 7  384-well row / column   -> ``coord384`` as "r<row>_c<col>"
        8  common name                -> ``name`` (None if shorter than 2 chars)
        9  reference GFP level        -> ``gfp_ref`` (float or None)
        10 compartment / localisation -> ``compartment``

    All other attributes (mRNA, mRNA_h2, gfp_by_*, gfp_rm_*, gfp_h2) start as
    None and are expected to be filled in by the caller.
    """

    def __init__(self, line):
        """Parse `line`; raises IndexError if it has fewer than 11 columns."""
        d = line.strip().split('\t')
        self.id = d[0]
        self.name = d[8] if len(d[8]) >= 2 else None
        self.mRNA = None
        self.mRNA_h2 = None
        # Values shorter than two characters, or starting with "not", "low" or
        # "tec" (e.g. 'not visualized', 'low signal', 'technical problem'),
        # count as "no measurement".
        raw_gfp = d[9]
        if len(raw_gfp) < 2 or raw_gfp.startswith(("not", "low", "tec")):
            self.gfp_ref = None
        else:
            self.gfp_ref = float(raw_gfp)
        self.gfp_by_mean = None
        self.gfp_by_mode = None
        self.gfp_by_median = None
        self.gfp_by_var = None
        self.gfp_rm_mean = None
        self.gfp_rm_mode = None
        self.gfp_rm_median = None
        self.gfp_rm_var = None
        self.gfp_h2 = None
        self.compartment = d[10]
        self.plate96 = d[1]
        self.plate384 = d[5]
        self.coord96 = d[2]
        self.coord384 = "r%s_c%s"%(d[6],d[7])

    def __str__(self):
        """One-line summary: id, name, plate positions, mRNA, GFP, compartment."""
        # __init__ never sets `gfp`, so reading it directly raised
        # AttributeError. Use a caller-assigned `gfp` if present, otherwise the
        # reference level parsed from the line.
        gfp = getattr(self, "gfp", self.gfp_ref)
        return "%s (%s) - 384-%s %s / 96-%s %s, mRNA=%s, GFP=%s (%s)"%(
            self.id, self.name, self.plate384, self.coord384,
            self.plate96, self.coord96, self.mRNA, gfp, self.compartment)

    def h2str(self):
        """Tab-separated summary of ``mRNA`` and ``mRNA_h2``.

        Requires ``mRNA`` to hold at least six numbers and ``mRNA_h2`` at least
        two. Items 1, 3 and 5 of ``mRNA`` are printed as square roots
        (presumably variances shown as standard deviations -- confirm against
        the code that fills ``mRNA``).
        """
        m = self.mRNA
        return "%s\tBy:%.2f(%.2f)\tRm:%.2f(%.2f)\tSeg:%.2f(%.2f)\th2:%.2f %.2f"%(
            self.id, m[0], m[1]**0.5, m[2], m[3]**0.5, m[4], m[5]**0.5,
            self.mRNA_h2[0], self.mRNA_h2[1])



def read_gfp_summary(set, combine_method="mixture-cutoff-3.2"):
    """Load a combined plate-statistics pickle and normalise its well labels.

    The pickle ``<combine_method>_<set>.pickle`` is read with ``cl`` from the
    combined plate_stats directory under DATA_DIR. ``d['wells']`` is then
    rewritten in place:

    * if `set` contains "Candidate", each label is reduced to the last
      character of its second-to-last "_"-separated part, an underscore, and
      its last part;
    * if `set` contains "-rev" (flipped plate), each "<subplate>_<row><col>"
      label is mirrored: subplate A<->D / B<->C, row A<->H ... and column c
      becomes 13 - c.

    Returns the loaded object `d`.
    """
    pickle_path = "%s/cytometry/Pilot_screen_BYxRM/plate_stats/combined/%s_%s.pickle"%(DATA_DIR, combine_method, set)
    d = cl(pickle_path)

    if "Candidate" in set:
        for idx in range(len(d['wells'])):
            parts = d['wells'][idx].split("_")
            d['wells'][idx] = parts[-2][-1:] + "_" + parts[-1]

    if "-rev" in set: # if the plate is flipped
        for idx in range(len(d['wells'])):
            sub_label, well_label = d['wells'][idx].split("_")
            flipped_sub = "ABCD"[ord('D') - ord(sub_label[0])]
            flipped_row = "ABCDEFGH"[ord('H') - ord(well_label[0])]
            flipped_col = 13 - int(well_label[1:])
            d['wells'][idx] = "%s_%s%d"%(flipped_sub, flipped_row, flipped_col)

    return d



def read_segregant_expr(filename='%s/mrna/pbiol_segregants.tab'%DATA_DIR):
    """Read segregant expression values for the glucose-grown strains.

    The first line of `filename` is a tab-separated header whose columns from
    index 2 onwards are strain names; only strains whose name contains
    "glucose" are kept. On every following line, column 1 is whitespace
    separated metadata: the last token that starts with "Y" and has "L" or "R"
    as its third character is used as the gene name (None if no token
    matches). Empty cells are read as NaN.

    Returns:
        (expr, strains): `expr` maps gene name to a float array of the kept
        strains' values; `strains` is the array of kept strain names.
    """
    expr = {}
    # `with` closes the handle even if a line fails to parse.
    with open(filename, 'r') as ifh: # 1. segregants
        strains = next(ifh).strip().split('\t')[2:] # strain names
        keep = [x for x, s in enumerate(strains) if s.count("glucose") > 0] # filtered for growth in glucose

        for line in ifh:
            d = SP.array(line.strip("\n").split('\t'))
            gene_name = None
            for token in d[1].split(): # get gene information
                # Length guard: tokens shorter than three characters cannot be
                # ORF names and used to raise IndexError on token[2].
                if len(token) > 2 and token[0] == "Y" and token[2] in "LR":
                    gene_name = token
            d[SP.where(d == "")[0]] = SP.nan # fill empty spaces with NaN
            expr[gene_name] = SP.array(d[2:], float)[keep]

    return expr, SP.array(strains)[keep]


""" Strain = || 102E 21_2_d YLK511 1% eth      251338410284_1_2"""
def read_strain_snps(strains):
    """Select the SNP rows that belong to the given strains.

    The second whitespace-separated token of each entry in `strains` is used
    as the strain id. An individual from ``individuals_segregants.pickle``
    matches when its label contains " <id> " (space delimited) and "glu".
    Indices of all matches are collected in strain order.

    If a strain does not match exactly one individual, the debugger is
    entered via ``pdb.set_trace()``.

    Returns:
        (snps[matched_indices, :, :], snploc) as loaded with ``cl``.
    """
    snps = cl("%s/mrna/snps_segregants.pickle"%DATA_DIR)
    snploc = cl("%s/mrna/snploc.pickle"%DATA_DIR)
    inds = cl("%s/mrna/individuals_segregants.pickle"%DATA_DIR)

    strain_ids = [label.split()[1] for label in strains]
    strain_idx = []
    for strain_id in strain_ids:
        needle = " %s "%(strain_id)
        hits = [j for j in range(len(inds))
                if inds[j].count(needle) > 0 and inds[j].count("glu") > 0]
        strain_idx.extend(hits)
        if len(hits) != 1:
            # zero or several individuals for this strain: stop for inspection
            pdb.set_trace()
    return snps[strain_idx,:,:], snploc



def read_gene_common_name():
    """Map each ORF name to its common gene name from ``mrna/genome.gff``.

    For every tab-separated line whose feature type (column 2) is 'gene', the
    ";"-separated attributes in column 8 are scanned: ``Name=`` gives the ORF
    name and ``gene=`` the common name. Example line (attributes shortened):

        chr01  SGD  gene  538  792  .  +  .  ID=YAL068W-A;Name=YAL068W-A;...

    Returns:
        dict ORF name -> common name, or -> the ORF name itself when the
        feature has no ``gene=`` attribute.
    """
    gene_hash = {}
    # `with` guarantees the handle is closed.
    with open("%s/mrna/genome.gff"%DATA_DIR, 'r') as ifh:
        for line in ifh:
            if line.startswith("##FASTA"):
                break # the rest of a GFF3 file is embedded sequence
            if not line.strip() or line.startswith("#"):
                continue # blank line, comment or directive
            fields = line.strip().split('\t')
            common_name = None
            sgd_orf = None
            if fields[2] == 'gene':
                for param in fields[8].split(';'):
                    # partition() tolerates attributes without "=" and values
                    # that themselves contain "=".
                    var, _, val = param.partition('=')
                    if var == 'gene':
                        common_name = val
                    elif var == 'Name':
                        sgd_orf = val
            # NOTE(review): non-gene features also reach this line with
            # sgd_orf None, so the result carries a None -> None entry; kept
            # unchanged for backward compatibility.
            gene_hash[sgd_orf] = common_name if (common_name is not None) else sgd_orf

    return gene_hash




def read_common_name_orf():
    """Map each common gene name to its ORF name from ``mrna/genome.gff``.

    For every tab-separated line whose feature type (column 2) is 'gene', the
    ";"-separated attributes in column 8 are scanned: ``gene=`` gives the
    common name and ``Name=`` the ORF name. Example line (attributes
    shortened):

        chr01  SGD  gene  538  792  .  +  .  ID=YAL068W-A;Name=YAL068W-A;...

    Returns:
        dict common name -> ORF name; features without a ``gene=`` attribute
        are omitted. The value is None if the feature has no ``Name=``.
    """
    orf_hash = {}
    # `with` guarantees the handle is closed.
    with open("%s/mrna/genome.gff"%DATA_DIR, 'r') as ifh:
        for line in ifh:
            if line.startswith("##FASTA"):
                break # the rest of a GFF3 file is embedded sequence
            if not line.strip() or line.startswith("#"):
                continue # blank line, comment or directive
            fields = line.strip().split('\t')
            if fields[2] != 'gene':
                continue
            common_name = None
            sgd_orf = None
            for param in fields[8].split(';'):
                # partition() tolerates attributes without "=" and values that
                # themselves contain "=".
                var, _, val = param.partition('=')
                if var == 'gene':
                    common_name = val
                elif var == 'Name':
                    sgd_orf = val
            if common_name is not None:
                orf_hash[common_name] = sgd_orf

    return orf_hash



def read_gene_locs(return_strand=False, use_common_name=False):
    """Read gene coordinates from ``mrna/genome.gff``.

    Only lines whose feature type (column 2) is 'gene' are used. The
    chromosome (column 0) becomes 17 if its name contains "m", otherwise the
    integer that follows its first three characters (e.g. "chr01" -> 1).
    Example line (attributes shortened):

        chr01  SGD  gene  538  792  .  +  .  ID=YAL068W-A;Name=YAL068W-A;...

    Args:
        return_strand: if true, append the strand (column 6) to each value.
        use_common_name: if true, key by the ``gene=`` attribute when the
            feature has one; otherwise (and by default) by ``Name=``.

    Returns:
        dict name -> (chrm, start, end) or (chrm, start, end, strand), with
        start and end as ints from columns 3 and 4.
    """
    gene_hash = {}
    # `with` guarantees the handle is closed.
    with open("%s/mrna/genome.gff"%DATA_DIR, 'r') as ifh:
        for line in ifh:
            if line.startswith("##FASTA"):
                break # the rest of a GFF3 file is embedded sequence
            if not line.strip() or line.startswith("#"):
                continue # blank line, comment or directive
            fields = line.strip().split('\t')
            if fields[2] != 'gene':
                continue
            chrm, start, end, strand, meta = fields[0], fields[3], fields[4], fields[6], fields[8]
            if chrm.count("m") > 0: chrm = 17
            else: chrm = int(chrm[3:])
            common_name = None
            sgd_orf = None
            for param in meta.split(';'):
                # partition() tolerates attributes without "=" and values that
                # themselves contain "=".
                var, _, val = param.partition('=')
                if var == 'gene':
                    common_name = val
                elif var == 'Name':
                    sgd_orf = val
            name = common_name if (use_common_name and (common_name is not None)) else sgd_orf
            if return_strand: gene_hash[name] = (chrm, int(start), int(end), strand)
            else: gene_hash[name] = (chrm, int(start), int(end))

    return gene_hash


def read_sequence():
    """Read every ``<DATA_DIR>/ref/chr*.fsa`` file into a dict of sequences.

    Returns:
        dict mapping the file's base name up to its first "." (e.g. "chr01"
        for "chr01.fsa") to the concatenation of all lines that do not start
        with ">", each stripped of surrounding whitespace.
    """
    res = {}

    for filename in glob.glob("%s/ref/chr*.fsa"%DATA_DIR):
        chrm = os.path.basename(filename).split(".")[0]
        with open(filename, 'r') as ifh:
            # join() builds the sequence in one pass instead of re-copying the
            # growing string for every line.
            res[chrm] = "".join(l.strip() for l in ifh if not l.startswith(">"))
    return res


def read_seg_expr(segregant_filename="%s/mrna/pbiol_segregants.tab"%DATA_DIR, picklefile=None, condition="glucose"):
    """Read segregant expression values for strains grown in `condition`.

    The first line of `segregant_filename` is a tab-separated header whose
    columns from index 2 onwards are strain names; only strains whose name
    contains `condition` are kept. On every following line, column 1 is
    whitespace-separated metadata and its last token starting with "Y" is the
    gene name (None if there is none). Empty cells are read as NaN.

    Args:
        segregant_filename: path of the tab-separated expression table.
        picklefile: if not None, the result tuple is also stored there with
            ``cdm``.
        condition: substring a strain name must contain to be kept.

    Returns:
        (strains, gene_expr): array of kept strain names, and a dict mapping
        gene name to a float array of those strains' values.
    """
    gene_expr = {}
    # `with` closes the handle even if a line fails to parse.
    with open(segregant_filename, 'r') as ifh: # 1. segregants
        strains = next(ifh).strip().split('\t')[2:] # strain names
        keep = [x for x, s in enumerate(strains) if s.count(condition) > 0] # filtered for growth condition

        for line in ifh:
            d = SP.array(line.strip("\n").split('\t'))
            gene_name = None
            for token in d[1].split(): # for fields in metadata
                if token[0] == "Y": gene_name = token

            d[SP.where(d == "")[0]] = SP.nan # fill empty spaces with NaN
            gene_expr[gene_name] = SP.array(d[2:], float)[keep] # values filtered to correct strains

    kept_strains = SP.array(strains)[keep]
    if picklefile is not None: # store convenience form if desired
        cdm((kept_strains, gene_expr), picklefile)

    return kept_strains, gene_expr



def read_expr(segregant_filename='%s/mrna/pbiol_segregants.tab'%DATA_DIR, parent_filename="%s/mrna/pbiol_parents.tab"%DATA_DIR):
    """Per-gene mean and variance of expression for segregants and both parents.

    Segregant file: header columns from index 2 onwards are strain names and
    only those containing "glucose" are used; on data lines column 1 is
    whitespace-separated metadata whose last token starting with "Y" is the
    gene name.

    Parent file: header columns from index 1 onwards are strain names; a
    strain belongs to parent 'BY' or 'RM' if its name starts with that prefix
    and ends in "g". The four lines after the header are skipped; on data
    lines column 0 is the gene name.

    Empty cells are treated as NaN and ignored by the statistics.
    NOTE(review): the variance is ``ST.nanstd(vals)**2``, so whether it is the
    sample or population variance depends on ST.nanstd's default -- confirm
    against the installed SciPy.

    Returns:
        (genes, means, vars): `genes` maps gene name to its index in the
        arrays (sorted gene order, restricted to genes present in both the
        segregant and the parent data); `means` and `vars` are dicts keyed
        'seg', 'BY', 'RM' holding aligned arrays.
    """
    groups = ('seg', 'BY', 'RM')
    mean_expr = dict((k, {}) for k in groups)
    var_expr = dict((k, {}) for k in groups)

    # `with` closes each handle even if a line fails to parse.
    with open(segregant_filename, 'r') as ifh: # 1. segregants
        strains = next(ifh).strip().split('\t')[2:] # strain names
        keep = [x for x, s in enumerate(strains) if s.count("glucose") > 0] # filtered for growth in glucose

        for line in ifh:
            d = SP.array(line.strip("\n").split('\t'))
            gene_name = None
            for token in d[1].split(): # get gene information
                if token[0] == "Y": gene_name = token

            d[SP.where(d == "")[0]] = SP.nan # fill empty spaces with NaN
            vals = SP.array(d[2:], float)[keep] # values filtered to correct strains
            mean_expr['seg'][gene_name] = ST.nanmean(vals) # store mean
            var_expr['seg'][gene_name] = ST.nanstd(vals)**2 # variance

    with open(parent_filename, 'r') as ifh: # 2. parentals
        strains = next(ifh).strip().split('\t')[1:] # strain names
        parent_idx = {} # indexes for list of strains corresponding to each parent
        for p in 'BY', 'RM':
            # startswith/endswith also cope with an empty header cell, where
            # indexing the last character raised IndexError.
            parent_idx[p] = [x for x, s in enumerate(strains) if s.endswith("g") and s.startswith(p)] # filtered for growth in glucose
        for _ in range(4): next(ifh) # redundant lines
        for line in ifh:
            d = SP.array(line.strip().split('\t'))
            gene_name = d[0] # get gene information
            d[SP.where(d == "")[0]] = SP.nan # fill empty spaces with NaN
            for p in 'BY', 'RM':
                vals = SP.array(d[1:], float)[parent_idx[p]] # get parent values from line
                mean_expr[p][gene_name] = ST.nanmean(vals) # mean of values filtered to correct strains
                var_expr[p][gene_name] = ST.nanstd(vals)**2 # variance of the same values

    # construct a single list of aligned genes for all sets (segregants, each parent)
    genes = {}
    means = dict((k, []) for k in groups)
    variances = dict((k, []) for k in groups)
    shared = sorted(set(mean_expr['seg'].keys()) & set(mean_expr['BY'].keys())) # genes in both segregants and parents
    for i, g in enumerate(shared):
        for k in groups:
            means[k].append(mean_expr[k][g]) # add the values to the lists
            variances[k].append(var_expr[k][g])
        genes[g] = i
    for k in groups: # update lists to arrays
        means[k] = SP.array(means[k])
        variances[k] = SP.array(variances[k])

    return genes, means, variances
    

def get_seq_chr_names():
    """Read ``<DATA_DIR>/seq/ref/chr_names.tab`` into a name-mapping dict.

    Every non-blank line is split on whitespace. The key is the first token
    without its leading character; the value is the part of the second token
    between its first and second "=" (or the end), without its last character.
    (Presumably these are FASTA header lines such as ">id [key=value]" --
    confirm against the file.)

    Returns:
        dict built from those key/value pairs.
    """
    res = {}
    # `with` guarantees the handle is closed.
    with open("%s/seq/ref/chr_names.tab"%DATA_DIR, "r") as ifh:
        for l in ifh:
            d = l.strip().split()
            if not d:
                continue # blank line (e.g. trailing newline at end of file)
            res[d[0][1:]] = d[1].split("=")[1][0:-1]
    return res
