import scipy as SP
import pylab as PL
import glob
import time
from parts2014_gfpvar.tools.common import *
from annotation import *
from cluster import cClusteringSettings, DEFAULT_COLNAMES, DEFAULT_FILTERS
from util import *

# Annotation values in the TAP column that are not numeric; cPlateOrf leaves tap_level as NaN for these
BAD_TAP_SIGNAL = ['technical problem','low signal', 'not visualized']
# (lower, upper) FSC-A bounds; cWellStat always quantifies this slice in addition to the requested one
DEFAULT_FSCA_SLICE = (0,1e8)

# Not referenced in the visible part of this file
DEFAULT_FEATURES = ['mean','median','var']


""" This file holds class definitions for holding data about cytometry screens, and wrappers for creating
both screen level and plate level summaries. A lot of the statistical heavy lifting is done in the cluster.py file

There are two main functionalities provided by the extra wrappers. First, process_sceen uses create_plate_stats to
go from raw .fcs files to plate level summaries that include data from all the wells. Read_screen_results collates these
files into a single object for downstream analyses.
""" 



""" Class for holding results for an entire screen. In addition to screen metadata (experiment name, screen name, list of plate names)
it hold a map of ORFs to their biological (one per plate) and technical (multiple occurrences on one plate) replicas (cPlateOrf objects).
In addition there is a map of plate name to cScreenPlate objects that also hold all the same data.
""" 
class cScreenResults:
    """ Results of one whole screen: metadata, a plate name -> cScreenPlate map, and an
    ORF -> [cPlateOrf, one per plate] map.
    """

    def __init__(self, experiment, screen, plates, settings):
        """ Store the metadata and index the given plates.
        @param experiment experiment name
        @param screen screen name
        @param plates list of plate objects exposing .plate and .orf_stats (iterated twice)
        @param settings settings string; also parsed into a cClusteringSettings object
        """
        self.experiment = experiment
        self.screen = screen
        self.settings_string = settings
        self.settings = cClusteringSettings(settings)
        self.plates, self.plate_means = {}, {}
        self.plates.update((p.plate, p) for p in plates)

        # ORF -> list of per-plate stats objects; the plate itself is recorded inside each stats object
        self.orf_stats = {}
        for plate in plates:
            # every well type seen on this plate; "A/B" style types contribute both A and B
            per_orf_types = [" ".join(t.replace("/", " ") for t in ostats.type.values())
                             for ostats in plate.orf_stats.values()]
            all_types = SP.unique(" ".join(per_orf_types).split(" "))
            for orf, stat in plate.orf_stats.items():
                stat.calc_stats(all_types)
                self.orf_stats.setdefault(orf, []).append(stat)

    def __str__(self):
        """ Multi-line, human readable summary of the screen. """
        lines = ["Experiment: %s"%self.experiment,
                 "Screen    : %s"%self.screen,
                 "No. plates: %d"%(len(self.plates)),
                 "No. ORFs  : %d"%(len(self.orf_stats)),
                 "Settings  : %s"%self.settings_string]
        return "\n".join(lines)


""" Class for holding and manipulating results from a single plate of one screen.
Many filtering and normalisation steps take place at the plate level, thus several instance
methods are provided for these purposes.
""" 
class cScreenPlate(object):
    """ Data for a single plate of one screen: metadata, an ORF name -> cPlateOrf map,
    and the average of the per-well weighted cluster means.
    """

    def __init__(self, experiment, screen, platefile, settings):
        """ Load annotation and statistics for the plate named by platefile.
        The plate ID is the file's base name with the extension and the first six
        characters removed, e.g. '.../stats-Hap1-P8AB.pickle' gives 'Hap1-P8AB'.
        """
        basename = platefile.split("/")[-1]
        plate = basename.split(".")[0][6:]

        self.screen = screen
        self.experiment = experiment
        self.settings_string = settings
        self.plate = plate

        annot = read_plateannot(plate, screen, experiment) # (row, col) -> annotation
        stats = read_platestats(plate, settings, screen, experiment) # {'wells': [...], 'data': [...]}
        self.orf_stats = {}
        merge_wells = (self.screen == "haploid-R1")
        mean_values = []

        for idx, well in enumerate(stats['wells']):
            # well name such as 'B12' -> 1-based (row, column)
            key = ((ord(well[0]) - ord('A')) + 1, int(well[1:]))
            if key not in annot: # unannotated wells may have .fcs files, but are treated as empty
                continue
            well_annot = annot[key]
            orf_name = well_annot[2]
            stat = stats['data'][idx]

            if orf_name != "nan":
                existing = self.orf_stats.get(orf_name)
                if existing is None: # first well seen for this ORF
                    self.orf_stats[orf_name] = cPlateOrf(well_annot, stat, screen, plate, well)
                else: # further well of a known ORF
                    existing.update(well_annot, stat, well, combine_wells=merge_wells)

            # cluster means weighted by cluster size; computed for every annotated well
            weights = stat.cluster_count
            weighted_sum = SP.dot(SP.array(stat.cluster_mean).T, weights)
            mean_values.append(weighted_sum/sum(weights))

        self.mean_values = SP.array(mean_values).mean(axis=0)

    def __str__(self):
        """ Multi-line, human readable summary of the plate. """
        lines = ["Experiment: %s"%self.experiment,
                 "Screen    : %s"%self.screen,
                 "No. ORFs  : %d"%(len(self.orf_stats)),
                 "Settings  : %s"%self.settings_string]
        return "\n".join(lines)


""" Container for information about one biological replicate of one ORF in one plate of one screen. There may be multiple
wells of this plate corresponding to the ORF, but not multiple plates. """
class cPlateOrf:
    """ One ORF on one plate of one screen. Keeps, per well: the well type, the ploidy
    and the well statistics object, plus ORF level annotation.
    """

    def __init__(self, annotation, stats, screen, plate, well):
        """ Create the container from the first well seen for this ORF.
        @param annotation indexable; [0] well type, [1] ploidy, [2] ORF name, [3] common name, [4] TAP level, [5] localisation
        @param stats well statistics object with cluster_* lists (or a legacy 'clusters' attribute)
        @param screen screen name
        @param plate plate name
        @param well well name
        """
        self.screen = screen
        self.plate = plate
        self.orf_name = annotation[2]
        self.common_name = annotation[3]
        self.tap_level = SP.nan
        if annotation[4] not in BAD_TAP_SIGNAL:
            self.tap_level = float(annotation[4])
        self.localisation = annotation[5]

        self.wells = [well]
        self.type = {well:annotation[0]}
        self.ploidies = {well:annotation[1]}
        self.stats = {well:stats}

        # The legacy shim has to run first: it is what defines cluster_count for old objects
        self._adopt_legacy_clusters(stats)
        stats.cluster_ploidy = [annotation[1]]*len(stats.cluster_count)
        self._tag_cluster_names(stats, annotation[0])


    def _adopt_legacy_clusters(self, stats):
        """ Give stats objects that only carry 'clusters' the cluster_name/cluster_count fields. """
        if hasattr(stats, 'clusters'):
            stats.cluster_name = stats.clusters
            stats.cluster_count = [1000]*len(stats.clusters)


    def _tag_cluster_names(self, stats, well_type):
        """ Prefix every cluster name (in place) with the well type. """
        for c, name in enumerate(stats.cluster_name):
            stats.cluster_name[c] = "%s | %s"%(well_type, name)


    def update(self, annotation, stats, well, combine_wells=False):
        """ Update ORF data for this plate with info from another well.
        @param annotation indexable; [0] well type and [1] ploidy are read
        @param stats well statistics object for the new well
        @param well name of the new well
        @param combine_wells if True, merge the new well into the most recently stored one
               (new key is '<old>,<new>'); otherwise store it as a separate well
        """
        self._adopt_legacy_clusters(stats)

        if combine_wells: # Special case pretty much for haploid-R1 only
            w_old = self.wells.pop()
            w_new = ",".join([w_old,well])
            self.wells.append(w_new)
            self.ploidies[w_new] = ",".join(SP.unique([self.ploidies.pop(w_old), annotation[1]]))

            # A BY well merged with an RM well (either order) is a mixture; otherwise the new type is kept.
            # The previous type must be compared here, not the self.type dictionary.
            t_old, t_new = self.type.pop(w_old), annotation[0]
            is_mixture = ((t_old == "BY") and (t_new == "RM")) or \
                         ((t_old == "RM") and (t_new == "BY")) or \
                         ((t_old == "BY_RM_combined") and (t_new == "BY_RM_combined"))
            self.type[w_new] = "BY_RM_combined" if is_mixture else t_new

            # finally, append the new well's clusters to the old entry
            st = self.stats.pop(w_old)
            for c in range(len(stats.cluster_name)):
                # NOTE(review): list.count() looks for an element equal to "RFP low"/"RFP high"; full cluster
                # names never equal that, so these two skips presumably never fire. Left unchanged - confirm intent.
                if (st.cluster_name.count("RFP low") > 0) and (annotation[0] == "BY"): continue
                if (st.cluster_name.count("RFP high") > 0) and (annotation[0] == "RM"): continue

                st.cluster_name.append("%s | %s"%(annotation[0], stats.cluster_name[c]))
                st.cluster_count.append(stats.cluster_count[c])
                st.cluster_mean.append(stats.cluster_mean[c])
                st.cluster_median.append(stats.cluster_median[c])
                st.cluster_var.append(stats.cluster_var[c])
                st.cluster_linearvar.append(stats.cluster_linearvar[c])
                st.cluster_linearmean.append(stats.cluster_linearmean[c])
                st.cluster_linearmedian.append(stats.cluster_linearmedian[c])
                st.cluster_ploidy.append(annotation[1])
            self.stats[w_new] = st
        else:
            self.wells.append(well)
            self.type[well] = annotation[0]
            self.ploidies[well] = annotation[1]
            self.stats[well] = stats
            # same per-cluster ploidy bookkeeping as the constructor does for the first well
            stats.cluster_ploidy = [annotation[1]]*len(stats.cluster_count)
            self._tag_cluster_names(stats, annotation[0])


    def calc_stats(self, all_types):
        """ Calculate summary statistics of the orf on this plate.
        Creates self.median, mean, var, linearmedian, linearmean, linearvar and count, each of shape
        [n wells, n types, n features] and NaN where no cluster was found.
        @param all_types sequence of parent/type names; one slot per entry
        """
        n_wells, n_types = len(self.wells), len(all_types)
        n_features = 0 # stays 0 (empty last axis) if no well has any cluster
        for v in self.stats.values():
            if len(v.cluster_mean) > 0: n_features = max(n_features, len(v.cluster_mean[0]))

        shape = [n_wells, n_types, n_features]
        for attr in ('median', 'mean', 'var', 'linearmedian', 'linearmean', 'linearvar', 'count'):
            setattr(self, attr, SP.zeros(shape)*SP.nan)

        for w,well in enumerate(self.wells): # for each measured well
            st = self.stats[well]
            is_single_cluster = (len(st.cluster_count) <= 2)
            for t, wanted in enumerate(all_types):
                i_cluster = None # reset per type, so a type without a match keeps its NaNs
                for i, name in enumerate(st.cluster_name):
                    _type, _budding, _rfp, fsc = name.split("|")
                    p = get_parent(name, self.screen, single_slice_cluster=is_single_cluster, plate=self.plate)
                    if (p is None) or (wanted != p): continue # cluster does not belong to this type
                    if fsc.count("000") == 0: # slice, not everything; the last match wins
                        i_cluster = i

                if i_cluster is None: continue
                self.median[w,t,:] = st.cluster_median[i_cluster]
                self.mean[w,t,:] = st.cluster_mean[i_cluster]
                self.var[w,t,:] = st.cluster_var[i_cluster]
                self.linearmedian[w,t,:] = st.cluster_linearmedian[i_cluster]
                self.linearmean[w,t,:] = st.cluster_linearmean[i_cluster]
                self.linearvar[w,t,:] = st.cluster_linearvar[i_cluster]
                self.count[w,t,:] = st.cluster_count[i_cluster]


    def __str__(self):
        """ Multi-line summary; types and ploidies are listed in well order. """
        medians = "not calculated" # calc_stats has not been run yet
        if hasattr(self, 'median'):
            medians = str(self.median.mean(axis=0))
        return "Screen    : %s\n"%self.screen + \
               "Plate     : %s\n"%self.plate + \
               "ORF       : %s\n"%self.orf_name + \
               "Wells     : %s\n"%(",".join(self.wells)) + \
               "Types     : %s\n"%(",".join([str(self.type[w]) for w in self.wells])) + \
               "Ploidies  : %s\n"%(",".join([str(self.ploidies[w]) for w in self.wells])) + \
               "Medians   : %s"%medians


""" Container for summary statistics about a single well. """
class cWellStat:
    """ Summary statistics of a single well, split by RFP cluster and by FSC-A (cell size) slice. """

    def __init__(self, data, p_bud, p_rfp, threshold=0.9, features=DEFAULT_COLNAMES, fsca_slice=DEFAULT_FSCA_SLICE, debug=False):
        """ Calculate statistics from given data on all cells in the well.
        @param data cells x features matrix; 10**data is used as the linear scale
        @param p_bud per-cell probability; cells with p_bud > threshold are the ones quantified
        @param p_rfp cells x clusters matrix of RFP cluster probabilities; the last column is not quantified
        @param threshold probability cutoff for assigning a cell to a cluster
        @param features column names of data; must contain "FSC-A"
        @param fsca_slice (lower, upper) FSC-A bounds quantified in addition to DEFAULT_FSCA_SLICE
        @param debug if True, plot the selected cells and print the statistics
        """
        slices = [DEFAULT_FSCA_SLICE, fsca_slice] # cell size slices to perform quantification in
        i_fsca = features.index("FSC-A")

        # 0. broad statistics
        self.features = features
        self.cluster_name, self.cluster_count = [], []
        self.cluster_mean, self.cluster_median, self.cluster_var = [], [], []
        self.cluster_linearmean, self.cluster_linearmedian, self.cluster_linearvar = [], [], []
        self.n_cells = data.shape[0]
        # NOTE(review): create_plate_stats in this file passes the probability of the *unbudded* cluster
        # as p_bud, so despite its name this counts the cells that pass the unbudded gate.
        self.n_budded_cells = sum(p_bud > threshold)
        self.n_rfp = (p_rfp > threshold).sum(axis=0)[0:-1] # all but last cluster of RFP

        # 1. keep only the cells passing the p_bud gate
        Iunbud = SP.where(p_bud > threshold)[0]
        data, p_rfp = data[Iunbud,:], p_rfp[Iunbud,:]

        # 2. RFP cluster specific statistics
        for c in range(p_rfp.shape[1] - 1): # for all RFP clusters but the last
            Ir = SP.where(p_rfp[:,c] > threshold)[0] # get cluster members
            if len(Ir) == 0: continue

            for s in slices: # for the default slice and quantification slice of cell size
                I = SP.where((data[Ir,i_fsca] > s[0]) & (data[Ir,i_fsca] < s[1]))[0] # filter on FSC-A
                if len(I) == 0: continue
                d = data[Ir,:][I] # get data in cluster in this slice
                self.cluster_name.append("Unbudded | RFP %s | FSCA in %.2f %.2f"%(["low","high"][c], s[0],s[1]))
                self.cluster_count.append(len(I))
                self.cluster_mean.append(d.mean(axis=0))
                self.cluster_median.append(SP.median(d,axis=0))
                self.cluster_var.append(d.var(axis=0))
                dlin = 10**d
                self.cluster_linearvar.append(dlin.var(axis=0))
                self.cluster_linearmean.append(dlin.mean(axis=0))
                self.cluster_linearmedian.append(SP.median(dlin,axis=0))

        if debug:
            self._debug_report(data, p_rfp, threshold, slices, i_fsca)


    def _debug_report(self, data, p_rfp, threshold, slices, i_fsca):
        """ Plot the cells of the quantification slice per RFP cluster and print all statistics. """
        import pylab as PL
        PL.figure()
        cols = 'rb'
        for c in range(p_rfp.shape[1] - 1): # for all RFP clusters but the last
            Ir = SP.where(p_rfp[:,c] > threshold)[0] # get cluster members
            if len(Ir) == 0: continue

            for s in slices[1:]: # only the quantification slice, not the default one
                I = SP.where((data[Ir,i_fsca] > s[0]) & (data[Ir,i_fsca] < s[1]))[0] # filter on FSC-A
                if len(I) == 0: continue
                d = data[Ir,:][I] # get data in cluster in this slice
                PL.subplot(311)
                PL.plot(d[:,2],d[:,0], ".", markersize=10, alpha=0.1, color=cols[c])
                PL.subplot(312)
                PL.plot(d[:,1],d[:,0], ".", markersize=10, alpha=0.1, color=cols[c])
                PL.subplot(313)
                PL.hist(d[:,0], bins=40, color=cols[c], alpha=0.3)
                print("GFP median between 4.4, 4.6: %.2f"%(SP.median(d[:,0])))

        # single-argument print(...) produces the same output under Python 2 and 3
        print("%s %s"%(self.n_cells, self.n_budded_cells))
        print(self.cluster_name)
        print(self.cluster_mean)
        print(self.cluster_median)
        print(self.cluster_var)
        print(self.cluster_linearvar)
        print(self.cluster_linearmean)
        print(self.cluster_linearmedian)
        PL.show()


    def __str__(self):
        """ Multi-line summary of cell and cluster counts. """
        return "# cells       : %d\n"%self.n_cells + \
               "# budded cells: %d\n"%self.n_budded_cells + \
               "# RFP ok cells: %s\n"%(str(self.n_rfp)) + \
               "Cluster names : %s\n"%("   ".join(self.cluster_name)) + \
               "Cluster counts: %s"%(", ".join(map(str, self.cluster_count)))



""" Process all fcs files in plates subdirectories for a given screen from a given experiment
prior is data or fixed. If data, prior for two-way clusters are estimated as means of bottom and top half of the data
n_restarts - number of random restarts for cluster fittings
n_rnd_points - number of datapoints to select from 48 well in the plate to construct clusters
threshold - cell is is assigned to cluster c if p(cell i in cluster c > threshold)
n_rfp_modes - number of main RFP expression level modes in the data (usually 2 for mixed wells, 1 for single query wells)
fsca_slice_size - fraction of the random cells used for calculating summary statistics. The cells are chosen around the median FSC-A level.
"""
def process_screen(screen, cluster_settings, experiment="Pilot_screen_BYxRM", overwrite=False, overwrite_cluster=False, plate_pattern="P*", external_hd=False):
    """ Cluster and quantify every plate of a screen, writing one stats-<plate>.pickle per plate.
    @param screen screen name
    @param cluster_settings settings object; prior, n_rnd_points, n_rnd_files, n_restarts, n_rfp_modes and
           threshold are read, and str(cluster_settings) names the output directory
    @param experiment global experiment string
    @param overwrite if False, plates whose stats file already exists are skipped
    @param overwrite_cluster if True, redo the clustering even if a cluster file exists
    @param plate_pattern glob pattern for plate directories
    @param external_hd if True, read the raw data from /Volumes/BACKUP; output still goes under DATA_DIR
    """
    screen_dir = "%s/cytometry/%s/%s"%(DATA_DIR, experiment, screen)
    stat_dir = "%s/stats_%s"%(screen_dir, str(cluster_settings)) # output directory, named after the options used
    if external_hd: screen_dir = "/Volumes/BACKUP/%s/%s"%(experiment, screen)
    if not os.path.isdir(stat_dir): # same effect as 'mkdir -p', without passing the path through a shell
        os.makedirs(stat_dir)

    plates = [d.split("/")[-1] for d in glob.glob("%s/%s"%(screen_dir, plate_pattern))] # get all plates for the screen

    for plate in sorted(plates): # for each plate
        outfile = "%s/stats-%s.pickle"%(stat_dir, plate)
        if os.path.exists(outfile) and (not overwrite): continue

        # 1. create budding and RFP clusters
        t0 = time.time()
        data, colnames, filters, fsca_slice = None, DEFAULT_COLNAMES, DEFAULT_FILTERS, DEFAULT_FSCA_SLICE
        cluster_file = "%s/cluster-gmm_plate-%s.pickle"%(stat_dir, plate)
        LOG.info("Processing screen %s; plate %s"%(screen, plate))

        if (not os.path.exists(cluster_file)) or overwrite_cluster: # if need to do the clustering
            # get random data from a lot of wells; with a manual prior the reader is asked for manual filters
            read_args = dict(n_rnd_points=cluster_settings.n_rnd_points, screen=screen, plate=plate, experiment=experiment, n_files=cluster_settings.n_rnd_files, external_hd=external_hd)
            if cluster_settings.prior == "manual":
                read_args['filters'] = "manual"
            data, colnames, filters = read_plate_fcs_random_data(**read_args)

        # data stays None when an existing cluster file is reused
        clustering = create_byxrm_plate_clusters_twophase(data, prior=cluster_settings.prior, n_restarts=cluster_settings.n_restarts, n_rnd_points=cluster_settings.n_rnd_points, outfile=cluster_file, overwrite=overwrite_cluster, n_rfp_modes=cluster_settings.n_rfp_modes, filters=filters)
        LOG.info("Done creating clusters, time=%.1f"%(time.time() - t0))

        # 2. calculate statistics for every well in the plate; the result is saved to outfile
        t0 = time.time()
        create_plate_stats(screen=screen, plate=plate, experiment=experiment, clustering=clustering, threshold=cluster_settings.threshold, outfile=outfile, colnames=colnames, filters=filters, external_hd=external_hd, fsca_slice=fsca_slice)
        LOG.info("Done quantifying well statistics, time=%.1f"%(time.time() - t0))



""" Create a single summary of cytometry data for each well in the plate
clustering - map of cluster name to GMM object (required to have a match for budding_GMM and rfp_GMMs)
colnames - column names to be extracted from FCS file
threshold - cutoff value for indicator variable to assign a data point to a cluster
outfile - if not None, file to save result to
fsca_slice - filter for FSC-A value (normally, either 10% centered on median, or entire range are used.
filters - list of linear gates (see DEFAULT_FILTERS for example)
min_cells - minimum number of entries in FCS file to be processed
"""
def create_plate_stats(screen, plate, experiment, clustering, colnames=DEFAULT_COLNAMES, threshold=0.9, outfile=None, fsca_slice=DEFAULT_FSCA_SLICE, filters=DEFAULT_FILTERS, min_cells=100, fsca_slice_size=0.1, external_hd=False):
    """ Create one cWellStat summary per .fcs file of the plate.
    @param screen, plate, experiment locate the directory holding the .fcs files
    @param clustering map with 'budding_GMM', 'rfp_GMMs', 'features', 'means' and 'sds'
    @param colnames column names to be extracted from each FCS file
    @param threshold cutoff for assigning a cell to a cluster
    @param outfile if not None, file to save the result to
    @param fsca_slice FSC-A slice used when fsca_slice_size is None (or a well has no unbudded cells)
    @param filters list of linear gates
    @param min_cells files with fewer cells after filtering are skipped
    @param fsca_slice_size if not None, fraction of unbudded cells around the median FSC-A that defines the slice
    @param external_hd if True, read from /Volumes/BACKUP instead of DATA_DIR/cytometry
    @return {'wells': [well names], 'data': [cWellStat objects]}
    """
    data_root = "/Volumes/BACKUP" if external_hd else DATA_DIR + "/cytometry"
    files = glob.glob("%s/%s/%s/%s/*.fcs"%(data_root, experiment, screen, plate))
    wells, stats = [], []
    budding_gmm, rfp_gmms = clustering['budding_GMM'], clustering['rfp_GMMs']

    for f in files: # for each FCS file
        # 0. read, filter, and standardise data
        cols, data = read_fcs(f, colnames_tostore=colnames, log=True)
        data = filter_fcs(data, cols, filters)
        if data.shape[0] < min_cells: continue
        Icol = [cols.index(c) for c in clustering['features']]
        data_std = (data[:,Icol] - clustering['means'])/clustering['sds']

        # 1. cluster data, after standardising it in identical way to the learned clusters
        Icluster = [clustering['features'].index(feature) for feature in DEFAULT_BUDDING_CLUSTER_FEATURES]
        p_bud = budding_gmm.predict_proba(data_std[:,Icluster])

        # 2. cluster unbudded cells further based on RFP
        unbudded_cluster = budding_gmm._means[0:2,1].argmin() # of the first two clusters, the one with the smaller mean in column 1
        rfp_gmm = rfp_gmms[unbudded_cluster]
        p_rfp = rfp_gmm.predict_proba(data_std[:, [clustering['features'].index('mCherry-A')]])
        if (rfp_gmm.n_components > 2) and (rfp_gmm._means[0] > rfp_gmm._means[1]): # if clusters in wrong order, reorder assignments
            p_rfp = p_rfp[:,[1,0,2]] # switch first two clusters

        # 3. pick the cell size slice for this well. A local is used so that one well's slice
        #    never leaks into the next well.
        well_slice = fsca_slice
        if fsca_slice_size is not None:
            Iunbudded = SP.where(p_bud[:,unbudded_cluster] > 0.8)[0]
            n = len(Iunbudded)
            if n > 0: # with no unbudded cells there is nothing to centre the slice on
                # data columns are described by cols (as used for filtering above), not by colnames
                fsca_sorted = sorted(data[Iunbudded, cols.index("FSC-A")])
                half_width = int(0.5*n*fsca_slice_size)
                i_low = max(n//2 - half_width, 0) # clamp so large fractions cannot leave the list
                i_high = min(n//2 + half_width, n - 1)
                well_slice = [fsca_sorted[i_low], fsca_sorted[i_high]]

        # 4. calculate well statistics based on the clusterings
        stats.append(cWellStat(data, p_bud[:,unbudded_cluster], p_rfp, threshold, cols, well_slice))
        wells.append(f.split("_")[-2]) # well name is the second to last '_' separated token of the path

    result = {'wells':wells, 'data':stats}
    if outfile is not None: cdm(result, outfile)
    return result



""" Read collected plate statistics for one plate
@param plate string for plate in the screen (e.g. P1)
@param settings long settings string for clustering settings
@param screen screen string (e.g. haploid-R1)
@param experiment global experiment
@return pickle file contents for this plate with the cluster settings from the screen and experiment
"""
def read_platestats(plate, settings, screen, experiment="Pilot_screen_BYxRM"):
    """ Load the stored statistics of one plate.
    If settings is None, the name of the first stats* entry found for the screen is used.
    """
    if settings is None:
        candidates = glob.glob("%s/cytometry/%s/%s/stats*"%(DATA_DIR, experiment, screen))
        first_match = candidates[0] # assumes at least one analysis exists for the screen
        settings = first_match.split("/")[-1]

    stats_path = "%s/cytometry/%s/%s/%s/stats-%s.pickle"%(DATA_DIR, experiment, screen, str(settings), plate)
    return cl(stats_path)



""" Read statistics from all plates, combines with annotations
Every ORF will be represented once per plate. Different plates are assumed different biological replicates for ORFs.
Same plate but different wells are assumed technical replicates.
@param screen string for screen name (e.g. haploid-R1)
@param settings long string for clustering settings. If None, the first one available (in output directory) is taken
@param experiment global experiment string
@return cScreenResults object for given screen and settings
"""
def read_screen_results(screen, settings=None, experiment="Pilot_screen_BYxRM"):
    """ Build a cScreenResults object from every stats*.pickle file of the screen.
    If settings is None, the name of the first stats*rfpmodes* entry found for the screen is used.
    """
    if settings is None:
        candidates = glob.glob("%s/cytometry/%s/%s/stats*rfpmodes*"%(DATA_DIR, experiment, screen))
        settings = candidates[0].split("/")[-1]

    plate_files = glob.glob("%s/cytometry/%s/%s/%s/stats*.pickle"%(DATA_DIR, experiment, screen, settings))
    plates = []
    for platefile in plate_files:
        LOG.debug("Reading screen %s plate file %s"%(screen, platefile))
        plate_obj = cScreenPlate(experiment, screen, platefile, settings)
        plates.append(plate_obj)

    return cScreenResults(experiment, screen, plates, settings)



""" Take an ORF statistic object (cORFStat), and give the BY and RM expression values in a simple form
@param orfstat cPlateOrf object
@param ploidies list of allowed ploidies
@param parents list of parents in this screen
@return well->slice->parent->[gfp features] + [cluster size]
"""
def get_screen_orf_summary(orfstat, ploidies, parents, debug=False):
    """ Collect the GFP features of each parent's cluster for every well of an ORF.
    @param orfstat cPlateOrf object
    @param ploidies allowed ploidies; wells with any other ploidy are left out
    @param parents parents in this screen
    @param debug if True, drop into pdb after each cluster is processed
    @return well -> 'slice'|'all' -> parent -> [mean, median, var, linearmean, linearmedian, linearvar, cluster size];
            the list is empty if no cluster was found for that parent
    """
    gfp_features = ['mean','median','var', "linearmean", "linearmedian", "linearvar"]
    plate_clusters = {}

    for well in orfstat.stats: # for each well of this orf in the screen
        if orfstat.ploidies[well] not in ploidies: continue # (skipping unwanted ploidies)

        plate_clusters[well] = {'all': dict((p, []) for p in parents),
                                'slice': dict((p, []) for p in parents)}
        stat = orfstat.stats[well]
        gfp_trait_index = stat.features.index("FITC-A") # get the index of GFP feature
        single_slice = (len(stat.cluster_count) <= 2)
        numdone = 0
        for i,clnm in enumerate(stat.cluster_name):
            # names of clusters computed on all data contain the upper FSC-A bound "...0000.00"
            slice_key = ['slice', 'all'][clnm.count("0000.00")]
            parent = get_parent(clnm, orfstat.screen, single_slice_cluster=single_slice, plate=orfstat.plate)
            if parent is None:
                continue # no sensible parent for this cluster (e.g. low RFP, BY cluster)
            target = plate_clusters[well][slice_key][parent]
            if len(target) != 0:
                continue # the first cluster found for a (slice, parent) pair is kept
            numdone += 1
            for feature in gfp_features:
                # plain attribute lookup instead of eval() on a constructed string
                cluster_gfp_feature = getattr(stat, 'cluster_%s'%feature)[i]
                target.append(float(cluster_gfp_feature[gfp_trait_index]))
            if debug:
                import pdb
                pdb.set_trace()
            target.append(stat.cluster_count[i]) # add cluster size

        # uncombined haploid-R1 wells are expected to give 2 entries, everything else 4
        is_single_query = (orfstat.screen == "haploid-R1") and (well.count(",") == 0)
        expected = 2 if is_single_query else 4
        if numdone != expected:
            LOG.debug("Weird number of done parentals for screen %s, well %s - %d"%(orfstat.screen, well, numdone))

    return plate_clusters



""" Take structured screen data, and return a matrix suitable for algebraic manipulations
@param screen_data list of cScreenResults objects
@param allowed_ploidies list of strings that the ploidy of the well must match to be included in the output
@return (metadata, data) Data is Nx14 matrix, where N is the total number of wells. The columns are mean, median, variance, linearmean, linearmedian, linearvar, count (all of slice), mean, median, variance, linearmean, linearmedian, linearvariance, count (all of complete data). Metadata is a Nx7 matrix, with columns experiment, screen, plate, well, ORF, parental background, ploidy.
"""
def create_screen_matrix(screen_data, allowed_ploidies):
    """ Flatten structured screen data into a metadata matrix and a numeric matrix.
    @param screen_data list of cScreenResults objects
    @param allowed_ploidies ploidies a well must match to be included
    @return (metadata, data); one row per (well, parent) that has both slice and all-data values.
            Metadata columns: experiment, screen, plate, well, ORF, parent, ploidy.
            Data columns: the 7 slice values followed by the 7 all-data values.
    """
    res_meta, res_data = [], []

    for screen in screen_data: # for each screen
        parents = SP.unique(" ".join([" ".join([" ".join([x.replace("/"," ") for x in o.type.values()]) for o in orfstats]) for orfstats in screen.orf_stats.values()]).strip().split())
        if screen.screen == "haploid-R1": parents = ["BY", "RM"] # backward compatibility
        LOG.debug("Processing screen %s; ploidies = %s, parents = %s"%(screen.screen, str(allowed_ploidies), str(parents)))
        for orfstats in screen.orf_stats.values(): # for each orf in the screen
            for orfstat in orfstats: # one entry per plate the ORF occurs on - output all of them
                stats = get_screen_orf_summary(orfstat, allowed_ploidies, parents)
                for well in stats:
                    for parent in parents:
                        if parent not in stats[well]['slice']:
                            continue
                        slice_vals, all_vals = stats[well]['slice'][parent], stats[well]['all'][parent]
                        if len(slice_vals) + len(all_vals) < 8:
                            continue # if not full data, no point in including the observation
                        if len(slice_vals) != 7 or len(all_vals) != 7:
                            # unexpected lengths: rerun with the debugger enabled to inspect
                            get_screen_orf_summary(orfstat, allowed_ploidies, parents, debug=True)
                        res_meta.append([screen.experiment, screen.screen, orfstat.plate, well, orfstat.orf_name, parent, orfstat.ploidies[well]])
                        res_data.append(slice_vals + all_vals)
                        if len(res_data[-1]) > 14:
                            # single-argument print(...) gives the same output under Python 2 and 3
                            print(" ".join(map(str, [res_data[-1], well, parent, len(slice_vals), len(all_vals)])))
                            import pdb
                            pdb.set_trace()

    return SP.array(res_meta), SP.array(res_data, float)



""" Create a combined matrix of all observations
@param min_cells_slice minimum number of cells observed (within the restrictive slice) in the well to be reported
@param max_gfp_sd maximum log scale variance within the restrictive slice to be reported
@param screens list of string screens to report the results of
@param outfilename If not None, the file to output the results to
@param allowed_ploidies list of strings that the ploidy of the well must match to be included in the output
@param renames map of screen name->screen name for renaming screens. For paper output, putting R3 as R1 (cleanest), R1 + R1 fillin for R2 (two batches, not as clean).
@return (metadata, data) Data is Nx8 matrix, where N is the total number of wells. The columns are mean, median, variance, count (all of slice), mean, median, variance, count (all of complete data). Metadata is a Nx7 matrix, with columns experiment, screen, plate, well, ORF, parental background, ploidy.
"""
def create_combined_mat(min_cells_slice=150, max_gfp_sd=0.35, screens=("haploid-R1", "haploid-R3"), outfilename=None, allowed_ploidies=["Haploid"], renames=None):
    """ Read the given screens, combine them into one matrix and optionally write a tab separated table.
    @param min_cells_slice accepted but not used in this function
    @param max_gfp_sd accepted but not used in this function
    @param screens screen names to read
    @param outfilename if not None, file to write all observations to, grouped by ORF
    @param allowed_ploidies ploidies a well must match to be included (only read, never modified)
    @param renames optional screen name -> output name map; unlisted screens keep their name. Not modified.
    @return (metadata, data) as returned by create_screen_matrix
    """
    stats = [read_screen_results(s) for s in screens]
    meta, dat = create_screen_matrix(stats, allowed_ploidies)
    if renames is None: renames = {}

    # if desired, output all data into <outfilename>
    if outfilename is not None:
        with open(outfilename, 'w') as ofh: # closed even if a write fails
            ofh.write("Experiment\tScreen\tPlate\tWell\tOrf\tQuery\tPloidy\tGFP_slice_mean\tGFP_slice_median\tGFP_slice_var\tGFP_slice_linearmean\tGFP_slice_linearmedian\tGFP_slice_linearvar\tGFP_slice_count\tGFP_all_mean\tGFP_all_median\tGFP_all_var\tGFP_all_linearmean\tGFP_all_linearmedian\tGFP_all_linearvar\tGFP_all_count\n")
            # with no observations meta is one-dimensional and cannot be indexed by column
            orfs = SP.unique(meta[:,4]) if len(meta) > 0 else []
            for orf in orfs: # for each ORF, output all its observations
                for i in SP.where(meta[:,4] == orf)[0]:
                    screen_name = meta[i][1]
                    ofh.write("%s\t%s\t"%(meta[i][0], renames.get(screen_name, screen_name)))
                    ofh.write("%s\t%s\t%s\t%s\t%s"%(tuple(meta[i][2:])))
                    ofh.write("\t%.3f\t%.3f\t%.4f\t%.1f\t%.1f\t%.3e\t%d\t%.3f\t%.3f\t%.4f\t%.1f\t%.1f\t%.3e\t%d\n"%(tuple(dat[i])))

    return meta, dat


