# python libraries
import os
import sys
import getopt
import shutil
import logging
from collections import Counter, defaultdict
from scipy.stats.mstats import mquantiles
from pyfaidx import Fasta

from ..align.models import CloudModel
from ..align.candcloud import CloudFeat_t

from ..common import util
from ..common import plotter

from batch import Worker

# module-level logger, named after this module
logger = logging.getLogger(__name__)

#-------------------------------------------------------------------------
# estimate parameters from a few chosen wells
#-------------------------------------------------------------------------
class Estimator(Worker):
  """Estimate cloud model parameters from a sample of per-well stats.

  `run` loads cloud features from `*.stats.p` pickles found in
  `self.config.wellStats_path`, drops outlier clouds, trains a
  `CloudModel` on the remainder, and pickles the model data to
  `self.config.params_path`.  The filtered training features are also
  saved as `train-cloudfeats.p` in the well stats directory, and summary
  statistics plus a size histogram are produced from them.
  """

  # a stats file must hold at least this many entries to be sampled
  MIN_FRAGS_PER_WELL = 10
  # stop sampling once this many qualifying stats files have been loaded
  MAX_WELLS = 6
  # clouds at or above this quantile of size or read count are outliers
  OUTLIER_QUANTILE = 0.95

  def run(self):
    """Train the cloud model and write parameters, features and stats."""
    wellStats_path = self.config.wellStats_path
    params_path = self.config.params_path

    # load cloud features, then filter outlier clouds
    #-------------
    cloudFeats_list = self._loadCloudFeats(wellStats_path)
    # record the count *before* filtering so the two log lines differ
    numStarting = len(cloudFeats_list)
    cloudFeats_list = self._filterOutliers(cloudFeats_list)

    logger.info('number of starting clouds {0}'.format(numStarting))
    logger.info('  - {0} passed outlier size, numreads filter'.format(
      len(cloudFeats_list)))

    # save cloud features to pickle
    cfPickle_path = os.path.join(
      wellStats_path,
      'train-cloudfeats.p',
    )
    util.writePickle(cfPickle_path, cloudFeats_list)

    # train model on these features
    logger.info('training cloud model')
    model = CloudModel()
    model.train(cloudFeats_list)

    # save model to pickle
    logger.info('dumping parameters')
    params_map = {
      'cloudModel' : model.getPickleData(),
    }
    util.writePickle(params_path, params_map)

    self.__stats__(cfPickle_path)

  def _loadCloudFeats(self, wellStats_path):
    """Collect cloud features from up to MAX_WELLS qualifying stats files.

    Files are visited in `os.listdir` order, which is arbitrary.  A file
    with fewer than MIN_FRAGS_PER_WELL entries is skipped and does not
    count toward MAX_WELLS.

    Returns a flat list of the entries loaded from the sampled files.
    """
    statsFnames_list = [
      fname for fname in os.listdir(wellStats_path)
      if fname.endswith('.stats.p')
    ]
    cloudFeats_list = []
    loaded = 0
    for fname in statsFnames_list:
      statsFrag_list = util.loadPickle(os.path.join(wellStats_path, fname))
      if len(statsFrag_list) < self.MIN_FRAGS_PER_WELL:
        continue
      cloudFeats_list.extend(statsFrag_list)
      loaded += 1
      if loaded >= self.MAX_WELLS:
        break
    return cloudFeats_list

  def _filterOutliers(self, cloudFeats_list):
    """Keep clouds strictly below OUTLIER_QUANTILE in size and numReads."""
    sizes = [cf.size for cf in cloudFeats_list]
    numReads = [cf.numReads for cf in cloudFeats_list]

    sizeCutoff = mquantiles(sizes, prob=[self.OUTLIER_QUANTILE])[0]
    numReadsCutoff = mquantiles(numReads, prob=[self.OUTLIER_QUANTILE])[0]

    return [
      cf for cf in cloudFeats_list
      if cf.size < sizeCutoff and cf.numReads < numReadsCutoff
    ]

  def __stats__(self, cfPickle_path):
    """Log summary statistics and plot a size histogram.

    cfPickle_path -- path of the pickled list of cloud features to
                     summarize (as written by `run`).

    Appends the plots directory to `self.deliver_list`.
    """
    cloudFeats_list = util.loadPickle(cfPickle_path)

    # partition clouds by how many of their two end markers are valid
    nobar__cf = [
      cf for cf in cloudFeats_list
      if not cf.lbarValid and not cf.rbarValid
    ]
    pbar__cf = [
      cf for cf in cloudFeats_list
      if cf.lbarValid ^ cf.rbarValid
    ]
    valid__cf = [
      cf for cf in cloudFeats_list
      if cf.lbarValid and cf.rbarValid
    ]

    # the format strings need a placeholder for the counts to be logged
    logger.info('total no end-marker {0}'.format(len(nobar__cf)))
    logger.info('total partial end-marker {0}'.format(len(pbar__cf)))
    logger.info('total valid end-marker {0}'.format(len(valid__cf)))

    fragCoverage = sum(cf.size for cf in cloudFeats_list)
    # NOTE(review): 100 is presumably the short read length -- confirm
    srCoverage = 100 * sum(cf.numReads for cf in cloudFeats_list)

    logger.info('total long frag coverage {0}'.format(fragCoverage))
    logger.info('total short read coverage {0}'.format(srCoverage))
    if fragCoverage:
      logger.info('little x {0}'.format(1. * srCoverage / fragCoverage))
    else:
      # avoid ZeroDivisionError when there is nothing to summarize
      logger.warning('little x undefined, total long frag coverage is 0')

    logger.info('creating plots')
    plotsDir_path = 'plots'
    self.deliver_list.append(plotsDir_path)
    util.mkdir_p(plotsDir_path)
    # presumably switches the working directory so the plot lands in
    # plotsDir_path -- confirm against util.cd
    with util.cd(plotsDir_path):
      nobar__sizes = [cf.size for cf in nobar__cf]
      pbar__sizes = [cf.size for cf in pbar__cf]
      valid__sizes = [cf.size for cf in valid__cf]
      plotter.plotHistograms(
        [
          ('nobar', nobar__sizes),
          ('pbar', pbar__sizes),
          ('valid', valid__sizes),
        ],
        {
          'xlabel' : 'sizes',
          'ylabel' : 'freq',
          'fname'  : 'size_plot',
          'numBins': 20,
        }
      )

