/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2005) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////
//
// \file NormalizeFlowVec.cc   

#ifndef FORCE_DEBUG
     #define NDEBUG
#endif

#include "454/NormalizeFlowVec.h"

#include "CoreTools.h"
#include "math/Functions.h"
#include "TokenizeString.h"


// Convenience wrapper: normalizes query_flow_vec in place using a
// FlowVecNormalizer built from the given flow order and key string.
// The per-read statistics gathered by the normalizer are discarded.
// Returns true when the normalizer reports success (reason code 0),
// false for any other reason code.
bool
NormalizeFlowVec(vec<double> & query_flow_vec,
		 const FlowOrder &order,
		 const String &key)
{
  FlowStats discardedStats;
  FlowVecNormalizer normalizer(order, key);
  const int reasonCode = normalizer(query_flow_vec, discardedStats);
  return reasonCode == 0;
}

// Builds a normalizer for the given flow order and key sequence.
//
// The key's flow representation (key.F()) is split into two index lists:
//   highbases - key flows whose expected value is exactly 1
//   lowbases  - every other key flow
// The last high flow of the key is removed from highbases and remembered
// as firstreadbase; low flows located after it are discarded.
// Both lists are printed via PRINT once they are final.
//
// Aborts (ForceAssert) if the key has no high flow at all, or if no low
// flow remains before firstreadbase.
FlowVecNormalizer::FlowVecNormalizer(const FlowOrder &order,
				     const String &keyString) :
  lowmeanMultiplier(2.5),
  lowHighmeanMultiplier(0.35), highHighmeanMultiplier(1.45),
  maxConsecNoiseFlowThreshold(12), minConsecNoiseFlowThreshold(1),
  noiseFlowThreshold(0.5), highNoiseThreshold(0.1),
  key(order, keyString),
  firstreadbase(0)
{
  // classify every key flow as high (== 1) or low (anything else)
  for (int flow = 0; flow < key.F().size(); ++flow)
    {
      if (key.F()[flow] != 1.f)
	lowbases.push_back(flow);
      else
	highbases.push_back(flow);
    }

  // the last high key flow doubles as the start of the read proper
  ForceAssert(!highbases.empty());
  firstreadbase = highbases.back();
  highbases.pop_back();
  PRINT(highbases);

  // keep only the low flows that precede the first read base
  for (;;)
    {
      if (lowbases.empty() || lowbases.back() <= firstreadbase)
	break;
      lowbases.pop_back();
    }
  ForceAssert(!lowbases.empty());
  PRINT(lowbases);
}

// Normalizes a flowgram in place and fills in quality statistics.
//
// Processing steps:
//   1. background: mean of the key's low flows (stats.lowmean); a multiple
//      (lowmeanMultiplier) of it is subtracted from every flow and
//      negative results are clamped to zero;
//   2. key signal: mean of the key's high flows (stats.highmean);
//   3. unit signal: average of the flows in [firstreadbase, size/2) whose
//      value lies strictly between lowHighmeanMultiplier*highmean and
//      highHighmeanMultiplier*highmean (stats.avg_unitval);
//   4. every flow is divided by stats.avg_unitval;
//   5. noise statistics over [firstreadbase, size-1) decide acceptance.
//
// query_flow_vec  flow values, modified in place.  On failure the vector
//                 is left in whatever intermediate state had been reached
//                 (and is probably junk anyway).
// stats           cleared on entry, then filled as far as the computation
//                 proceeds before returning.
//
// Returns 0 for success, or a positive reason code on failure:
//   1 = key didn't match expected key (key signal too weak, or the vector
//       is too short to contain the key flows)
//   2 = no single-base (monomer) flows found in the search range
//   3 = number of consecutive noise flows is unlikely
//   4 = average noise value is too high
//   5 = both 3 and 4
int
FlowVecNormalizer::operator()(vec<double> & query_flow_vec,
			      FlowStats &stats)
{
  const int num_query_flows = query_flow_vec.size();
  stats.clear();

  // Every index below firstreadbase belongs to lowbases or highbases (see
  // the constructor), so a shorter vector cannot be indexed safely below.
  // Such a vector cannot contain the key; report it as a key mismatch
  // instead of reading out of bounds.
  if (num_query_flows < firstreadbase)
    return 1;

  // find the mean of the low values
  for (int i = 0; i < lowbases.isize(); ++i)
    {
      stats.lowmean += query_flow_vec[lowbases[i]];
    }
  stats.lowmean /= double(lowbases.size());

  // subtract multiple of lowmean from all the values, and zero any negative
  // values
  const double background = stats.lowmean * lowmeanMultiplier;
  for (int i = 0; i < num_query_flows; ++i)
    {
      query_flow_vec[i] -= background;
      if (query_flow_vec[i] < 0.0)
	{
	  query_flow_vec[i] = 0.0;
	}
    }

  // now find the mean of the high values
  // NOTE(review): highbases is empty when the key has a single high flow
  // (the constructor moves it to firstreadbase).  The division below then
  // presumably yields NaN, which fails every comparison and ends in
  // reason code 2 -- confirm whether such keys are expected.
  for (int i = 0; i < highbases.isize(); ++i)
    {
      stats.highmean += query_flow_vec[highbases[i]];
    }
  stats.highmean /= double(highbases.size());

  if (stats.highmean < 0.1)
    return 1; // 1 = failed because key didn't match expected key

  // look at the first half of the flowgram except the key flows,
  // and pick those flow signals whose values lie strictly between
  // lowHighmeanMultiplier and highHighmeanMultiplier times highmean;
  // these are provisionally the flow values of a single incorporated base;
  // compute the average of these values
  stats.sum_unitvals = 0.0;
  stats.num_unitvals = 0;

  const double unit_lower_bound = lowHighmeanMultiplier * stats.highmean;
  const double unit_upper_bound = highHighmeanMultiplier * stats.highmean;
  for (int i = firstreadbase; i < num_query_flows / 2; ++i)
    {
      if (query_flow_vec[i] > unit_lower_bound &&
	  query_flow_vec[i] < unit_upper_bound)
	{
	  stats.sum_unitvals += query_flow_vec[i];
	  stats.num_unitvals++;
	}
    }

  if (stats.num_unitvals == 0)
    return 2; // failed because didn't find any monomers in search range

  stats.avg_unitval
    = stats.sum_unitvals / static_cast<double>(stats.num_unitvals);

  // use  avg_unitval  as the unit to normalize  query_flow_vec
  for (int i = 0; i < num_query_flows; ++i)
    {
      query_flow_vec[i] /= stats.avg_unitval;
    }

  // compute the average noise flow value
  // and the maximum number of consecutive noise flows
  stats.sum_noise_values = 0.0;
  stats.num_noise_flow = 0;

  stats.max_consec_noise_flows = 0;
  unsigned int num_consec_noise_flows = 0;
  Bool prev_flow_is_noise = False;

  // note: the last flow of the vector is not examined
  for (int i = firstreadbase; i < num_query_flows - 1; ++i)
    {
      const double flow_val = query_flow_vec[i];

      if (flow_val < noiseFlowThreshold)
	{
	  // this is a noise flow value
	  stats.sum_noise_values += flow_val;
	  stats.num_noise_flow++;

	  // The counter only advances when the previous flow was noise too,
	  // so it records (run length - 1): an isolated noise flow counts
	  // as 0.  The thresholds are applied to this value as is.
	  if (prev_flow_is_noise)
	    {
	      num_consec_noise_flows++;
	    }
	  prev_flow_is_noise = True;
	}
      else
	{
	  // a signal flow ends the current run; remember the longest one
	  prev_flow_is_noise = False;
	  if (stats.max_consec_noise_flows < num_consec_noise_flows)
	    {
	      stats.max_consec_noise_flows = num_consec_noise_flows;
	    }
	  num_consec_noise_flows = 0;
	}
    }
  // account for a noise run that extends to the end of the scanned range
  if (prev_flow_is_noise &&
      stats.max_consec_noise_flows < num_consec_noise_flows)
    {
      stats.max_consec_noise_flows = num_consec_noise_flows;
    }

  // reject the flow vector
  //    if the maximum length of consecutive noise flows is too high
  // or if it is below the minimum -- no run of adjacent noise flows at all
  const bool unlikelyNoiseFlows =
    (stats.max_consec_noise_flows < minConsecNoiseFlowThreshold ||
     stats.max_consec_noise_flows > maxConsecNoiseFlowThreshold);

  // reject the flow vector if the average noise value is too high.
  // With no noise flows at all the average is reported as 0 rather than
  // the NaN that 0/0 would produce; the vector is still rejected through
  // unlikelyNoiseFlows in that case.
  if (stats.num_noise_flow > 0)
    {
      stats.avg_noise_value
	= stats.sum_noise_values / static_cast<double>(stats.num_noise_flow);
    }
  else
    {
      stats.avg_noise_value = 0.0;
    }

  const bool highNoise = (stats.avg_noise_value > highNoiseThreshold);

  if (unlikelyNoiseFlows && highNoise)
    return 5; // failed both high-noise and unlikely # noise flows tests
  if (unlikelyNoiseFlows)
    return 3; // failed because # noise flows found is unlikely
  if (highNoise)
    return 4; // failed because average noise value is too high

  // the flow vector looks good, so accept it
  return 0;
}

// Writes the statistics as one tab-separated record (no trailing tab or
// newline), in this field order:
//   lowmean, highmean, sum_unitvals, num_unitvals, avg_unitval,
//   sum_noise_values, num_noise_flow, max_consec_noise_flows,
//   avg_noise_value
// Returns the stream to allow chaining.
ostream & operator<<(ostream &out, const FlowStats &s)
{
  const char *const sep = "\t";

  // key statistics
  out << s.lowmean << sep;
  out << s.highmean << sep;

  // unit-signal statistics
  out << s.sum_unitvals << sep;
  out << s.num_unitvals << sep;
  out << s.avg_unitval << sep;

  // noise statistics
  out << s.sum_noise_values << sep;
  out << s.num_noise_flow << sep;
  out << s.max_consec_noise_flows << sep;
  out << s.avg_noise_value;

  return out;
}
