
/******************************************************************************
 *
 *  This file is part of canu, a software program that assembles whole-genome
 *  sequencing reads into contigs.
 *
 *  This software is based on:
 *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
 *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
 *
 *  Except as indicated otherwise, this is a 'United States Government Work',
 *  and is released in the public domain.
 *
 *  File 'README.licenses' in the root directory of this distribution
 *  contains full conditions and disclaimers.
 */

#include "system.H"
#include "intervals.H"
#include "math.H"

#include "AS_BAT_ReadInfo.H"
#include "AS_BAT_BestOverlapGraph.H"
#include "AS_BAT_Logging.H"

#include "AS_BAT_Unitig.H"


//  Compute a score for an overlap.
//
//  For containments, it's just the quality of the alignment, as the
//  number of bases is the same.
//
//  For dovetails, it's the length of the overlap, with ties broken by the
//  quality of the overlap.

static
uint64
scoreOverlap(BAToverlap& olap) {

  //  Quality component of the score: the smaller the evalue of the overlap,
  //  the larger this value.  It is combined into the bits below the shifted
  //  length.

  assert(olap.evalue <= AS_MAX_EVALUE);

  uint64  quality = AS_MAX_EVALUE - olap.evalue;

  assert(quality <= AS_MAX_EVALUE);

  //  Anything that isn't a dovetail is scored on quality alone; the length
  //  component stays zero.

  if (olap.isDovetail() == false)
    return(quality);

  //  For a dovetail, the length component is the length of the A read less
  //  the portion that hangs off the overlap: a_hang when that is positive,
  //  otherwise b_hang (which is added as-is).

  uint64  span = 0;

  if (olap.a_hang > 0)
    span = RI->readLength(olap.a_iid) - olap.a_hang;
  else
    span = RI->readLength(olap.a_iid) + olap.b_hang;

  //  Length goes above the quality bits, so length is compared first and
  //  quality only breaks ties.

  return((span << AS_MAX_EVALUE_BITS) | quality);
}



//
//
//



void
BestOverlapGraph::findInitialEdges(void) {

  _nReadsEP[0] = _nReadsEP[1] = 0;   //  Number of reads with an edge possible.   [0] == graph limit
  _nReadsEF[0] = _nReadsEF[1] = 0;   //  Number of reads with an edge found.      [1] == max   limit

  //  Find initial contained reads and edges using the preferred erateGraph.
  //  Then count the number of reads that could have an edge, and the number
  //  that do have at least one edge.

  _errorLimit = _erateGraph;

  findContains();
  findEdges(true);

  for (uint32 fi=1; fi <= RI->numReads(); fi++) {
    if (!RI->isValid(fi) || isContained(fi) || isIgnored(fi))
      continue;

    _nReadsEP[0]++;

    if (getBestEdgeOverlap(fi, false)->isUnset() == false ||
        getBestEdgeOverlap(fi,  true)->isUnset() == false)
      _nReadsEF[0]++;
  }

  //  If we've found a sufficint number of reads with edges, we're done here.

  if (_nReadsEF[0] >= _minReadsBest * _nReadsEP[0])
    return;

  //  Otherwise, increase the limit to the max and recompute.

  _errorLimit = _erateMax;

  findContains();
  findEdges(true);

  for (uint32 fi=1; fi <= RI->numReads(); fi++) {
    if (!RI->isValid(fi) || isContained(fi) || isIgnored(fi))
      continue;

    _nReadsEP[1]++;

    if (getBestEdgeOverlap(fi, false)->isUnset() == false ||
        getBestEdgeOverlap(fi,  true)->isUnset() == false)
      _nReadsEF[1]++;
  }
}



bool
BestOverlapGraph::summarizeBestEdges(double errorLimit, double p, uint32 nFiltered[4]) {
  uint32  fiLimit    = RI->numReads();

  nFiltered[0] = 0;   //  Both edges below limit
  nFiltered[1] = 0;   //  One edge above limit
  nFiltered[2] = 0;   //  Both edges above limit
  nFiltered[3] = 0;   //  Total number of reads with an edge

  for (uint32 fi=1; fi <= fiLimit; fi++) {
    BestEdgeOverlap *b5 = getBestEdgeOverlap(fi, false);
    BestEdgeOverlap *b3 = getBestEdgeOverlap(fi, true);

    if ((b5->isUnset() != true) ||    //  Read has at least one best edge.
        (b3->isUnset() != true)) {
      uint32  nf = ((b5->erate() > errorLimit) +
                    (b3->erate() > errorLimit));
      nFiltered[nf]++;
    }
  }

  nFiltered[3] = nFiltered[0] + nFiltered[1] + nFiltered[2];

  return(nFiltered[0] >= p * nFiltered[3]);
}



//  Pick the error rate limit (_errorLimit) from the distribution of error
//  rates seen on the current best edges, and write a summary of the
//  candidate thresholds and of the edge filtering to 'report'.
//
//  'report' is written to unconditionally.
//
//  Expects _errorLimit to be either _erateGraph or _erateMax on entry (as
//  set in findInitialEdges()); this is asserted.
//
void
BestOverlapGraph::findErrorRateThreshold(FILE *report) {
  uint32  fiLimit    = RI->numReads();

  //  Find and remember the erate for every best edge.  No way around this,
  //  since we need to find the median value.
  //
  //  If there are no best edges, find the overlap with the most matches and
  //  use that.  This shouldn't happen anymore.

  std::vector<double>  erates;

  for (uint32 fi=1; fi <= fiLimit; fi++) {
    BestEdgeOverlap *b5 = getBestEdgeOverlap(fi, false);
    BestEdgeOverlap *b3 = getBestEdgeOverlap(fi, true);

    if (isIgnored(fi) == true)
      continue;

    //  If best edges, save the error rate of them.

    if (b5->isValid() == true)   erates.push_back(b5->erate());
    if (b3->isValid() == true)   erates.push_back(b3->erate());

    //  If no best edges, search the overlaps of this read for the one
    //  OC->compareOverlaps() prefers, and use its error rate.  Only
    //  filtering on error rate, because that's all we have computed so far.

    if ((b5->isUnset() == true) &&
        (b3->isUnset() == true)) {
      uint32      no    = 0;
      BAToverlap *ovl   = OC->getOverlaps(fi, no);
      BAToverlap best;

      for (uint32 oo=0; oo<no; oo++) {
        if (isOverlapBadQuality(ovl[oo]) == true)
          continue;

        //  b_iid == 0 marks 'best' as not yet assigned.

        if (best.b_iid == 0 || OC->compareOverlaps(best, ovl[oo])) {
           best = ovl[oo];
        }
      }

      if (best.b_iid != 0)
         erates.push_back(best.erate());
    }
  }

  std::sort(erates.begin(), erates.end());

  //  Find mean/stddev (with an online calculation) and the median/mad.

  stdDev<double>  edgeStats;

  for (uint32 ii=0; ii<erates.size(); ii++)
    edgeStats.insert(erates[ii]);

  double mean   = edgeStats.mean();
  double stddev = edgeStats.stddev();

  double median = 0.0;
  double mad    = 0.0;

  //  NOTE(review): the final 'true' presumably tells the function that
  //  erates is already sorted -- confirm against its declaration.

  computeMedianAbsoluteDeviation(erates, median, mad, true);

  //  Pick an error threshold.
  //
  //  The tight threshold is the minimum of:
  //    the user supplied threshold (-eg or -eM, whichever _errorLimit holds)
  //    the median + mad (if the median isn't zero)
  //    the percentile error rate (if the median is zero)
  //
  //  The loose threshold is the minimum of:
  //    the user supplied threshold (-eg or -eM, whichever _errorLimit holds)
  //    the mean + stddev
  //
  //  Note that there is no guarantee that TpickedTight <= TpickedLoose!
  //
  //  The 1.4826 below is presumably the usual factor that scales a MAD to a
  //  standard deviation for normally distributed data.

  double   Tmax    = _erateMax;
  double   Tgraph  = _erateGraph;
  double   Tforced = _erateForced;
  double   Tmean   = mean   + _deviationGraph          * stddev;
  double   Tmad    = median + _deviationGraph * 1.4826 * mad;

  //  The percentile position, (n+1) * percentile, lands at or past the end
  //  of the list when there are only a few samples (or the percentile is
  //  close to 1.0), and there is nothing to index at all when there are no
  //  samples.  Clamp to the largest sample in the first case; in the
  //  second, fall back to the limit already in force so that no tighter
  //  threshold is invented from no data.

  double   Tperct  = _errorLimit;

  if (erates.empty() == false) {
    uint32  pos = (uint32)((erates.size()+1) * _percentileError);

    if (pos >= erates.size())
      pos = (uint32)(erates.size() - 1);

    Tperct = erates[pos] + 1e-5;
  }

  assert((_erateGraph == _errorLimit) ||   //  Either erateGraph or erateMax, as
         (_erateMax   == _errorLimit));    //  set in findInitialEdges().

  double   TpickedTight = std::min(_errorLimit, ((median > 1e-10) ? Tmad : Tperct));
  double   TpickedLoose = std::min(_errorLimit, Tmean);
  double   Tpicked      = 0.0;

  //  Summarize the number of reads with edges at each of the two candidate
  //  thresholds.  A threshold is 'good' if at least fraction _minReadsBest
  //  of the reads with an edge have both edges at or below it.

  uint32  nFilteredTight[4] = {0};
  uint32  nFilteredLoose[4] = {0};

  bool  tightGood = summarizeBestEdges(TpickedTight, _minReadsBest, nFilteredTight);
  bool  looseGood = summarizeBestEdges(TpickedLoose, _minReadsBest, nFilteredLoose);

  //  Finally, decide on the tight or loose bound based on the number of reads with two best edges,
  //  If the tight bound isn't good and it's less than the loose bound, use the loose.
  //  Otherwise, the tight is good or we'd make it worse by picking the loose.
  //
  //  looseGood is computed only for its side effect of filling
  //  nFilteredLoose for the report; it does not take part in the decision.

  (void)looseGood;

  if ((tightGood == false) && (TpickedTight < TpickedLoose))
    Tpicked = TpickedLoose;
  else
    Tpicked = TpickedTight;

  //  A forced threshold below 1.0 overrides whatever was picked above.

  if (Tforced < 1.0)
    Tpicked = Tforced;

  _errorLimit = Tpicked;

  //  Now emit a lovely log.  Each threshold that is exactly equal to the
  //  limit finally chosen is tagged '(enabled)'.

  fprintf(report, "\n");
  fprintf(report, "ERROR RATES\n");
  fprintf(report, "-----------\n");
  fprintf(report, "                                                 --------threshold------\n");
  fprintf(report, "%-12u"     "                 fraction error      fraction        percent\n", edgeStats.size());
  fprintf(report, "samples                              (1e-5)         error          error\n");
  fprintf(report, "                 --------------------------      --------       --------\n");
  fprintf(report, "command line (-eg)                           ->  %8.2f      %8.4f%%%s\n",                                              1e5 * Tgraph,  100.0 * Tgraph,  (_errorLimit == Tgraph)  ? "  (enabled)" : "");
  if (Tforced < 1.0)
    fprintf(report, "command line (-ef)                           ->  %8.2f      %8.4f%%%s\n",                                              1e5 * Tforced, 100.0 * Tforced, (_errorLimit == Tforced) ? "  (enabled)" : "");
  else
    fprintf(report, "command line (-ef)                           ->  -----.--      ---.----%%\n");
  fprintf(report, "command line (-eM)                           ->  %8.2f      %8.4f%%%s\n",                                              1e5 * Tmax,    100.0 * Tmax,    (_errorLimit == Tmax)    ? "  (enabled)" : "");
  fprintf(report, "mean + std.dev   %8.2f +- %3.0f * %8.2f"  "  ->  %8.2f      %8.4f%%%s\n", 1e5 * mean,   _deviationGraph, 1e5 * stddev, 1e5 * Tmean,   100.0 * Tmean,   (_errorLimit == Tmean)   ? "  (enabled)" : "");
  fprintf(report, "median + mad     %8.2f +- %3.0f * %8.2f"  "  ->  %8.2f      %8.4f%%%s\n", 1e5 * median, _deviationGraph, 1e5 * mad,    1e5 * Tmad,    100.0 * Tmad,    (_errorLimit == Tmad)    ? "  (enabled)" : "");
  fprintf(report, "90th percentile                              ->  %8.2f      %8.4f%%%s\n",                                              1e5 * Tperct,  100.0 * Tperct,  (_errorLimit == Tperct)  ? "  (enabled)" : "");
  fprintf(report, "\n");
  fprintf(report, "BEST EDGE FILTERING\n");
  fprintf(report, "-------------------\n");
  fprintf(report, "At graph threshold %.4f%%, reads:\n", 100.0 * _erateGraph);
  fprintf(report, "  available to have edges:    %9u\n", _nReadsEP[0]);
  fprintf(report, "  with at least one edge:     %9u\n", _nReadsEF[0]);
  fprintf(report, "\n");
  fprintf(report, "At max threshold %.4f%%, reads:%s\n", 100.0 * _erateMax, (_nReadsEP[1] == 0) ? "  (not computed)" : "");
  fprintf(report, "  available to have edges:    %9u\n", _nReadsEP[1]);
  fprintf(report, "  with at least one edge:     %9u\n", _nReadsEF[1]);
  fprintf(report, "\n");
  fprintf(report, "At tight threshold %.4f%%, reads with:\n", 100.0 * TpickedTight);
  fprintf(report, "  both edges below error threshold: %9u  (%5.2f%% minReadsBest threshold = %u)\n", nFilteredTight[0], 100.0 * _minReadsBest, (uint32)(_minReadsBest * nFilteredTight[3]));
  fprintf(report, "  one  edge  above error threshold: %9u\n",                                        nFilteredTight[1]);
  fprintf(report, "  both edges above error threshold: %9u\n",                                        nFilteredTight[2]);
  fprintf(report, "  at least one edge:                %9u\n",                                        nFilteredTight[3]);
  fprintf(report, "\n");
  fprintf(report, "At loose threshold %.4f%%, reads with:\n", 100.0 * TpickedLoose);
  fprintf(report, "  both edges below error threshold: %9u  (%5.2f%% minReadsBest threshold = %u)\n", nFilteredLoose[0], 100.0 * _minReadsBest, (uint32)(_minReadsBest * nFilteredLoose[3]));
  fprintf(report, "  one  edge  above error threshold: %9u\n",                                        nFilteredLoose[1]);
  fprintf(report, "  both edges above error threshold: %9u\n",                                        nFilteredLoose[2]);
  fprintf(report, "  at least one edge:                %9u\n",                                        nFilteredLoose[3]);
  fprintf(report, "\n");
}



//
//
//



static
void
logCovGapRead(FILE *F, uint32 fi, intervalList<int32> &IL, bool dove5, bool dove3) {
  char    log[1024];
  uint32  lp = 0;

  sprintf(log+lp, "%-7u  %s %s  %3d  %7d-%-7d",
          fi,
          (dove5) ? "yes" : "no ",
          (dove3) ? "yes" : "no ",
          IL.numberOfIntervals(),
          IL.lo(0),
          IL.hi(0));

  while ((lp < 1024) && (log[lp] != 0))
    lp++;

  for (uint32 ii=1; ((lp < 1000) && (ii < IL.numberOfIntervals())); ii++) {
    sprintf(log+lp, " %7d-%-7d", IL.lo(ii), IL.hi(ii));

    while ((lp < 1024) && (log[lp] != 0))
      lp++;
  }

  log[lp++] = '\n';
  log[lp++] =  0;

#pragma omp critical (covGapPrint)
  fputs(log, F);
}



void
BestOverlapGraph::removeReadsWithCoverageGap(const char *prefix, covgapType ct, uint32 covGapOlap) {
  uint32  fiLimit    = RI->numReads();
  uint32  numThreads = getNumThreads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  uint8  *CG = new uint8 [fiLimit + 1];

  for (uint32 fi=0; fi <= fiLimit; fi++)
    CG[fi] = 0;

  FILE   *F = merylutil::openOutputFile(prefix, '.', "best.coverageGap", logFileFlagSet(LOG_BEST_EDGES));

  if (F) {
    fprintf(F, "         covered\n");
    fprintf(F, "readID     5' 3'  num  covered regions\n");
    fprintf(F, "-------  --- ---  ---  --------------------\n");
  }

  //  Search for reads that have an internal region with no coverage.
  //  If found, flag these as _coverageGap.

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fi=1; fi <= fiLimit; fi++) {
    uint32               fLen = RI->readLength(fi);
    uint32               no   = 0;
    BAToverlap          *ovl  = OC->getOverlaps(fi, no);
    intervalList<int32>  IL;
    bool                 dove5    = false;
    bool                 dove3    = false;
    bool                 isCovGap = false;

    //  Add an interval for each 'good' overlap.  Since this is really the
    //  first filter that would mark a read as 'junk', the test is simply
    //  overlap error.

    for (uint32 ii=0; ii<no; ii++) {
      if (isOverlapBadQuality(ovl[ii]) == true)
        continue;

      uint32   bgn =        ((ovl[ii].a_hang <= 0) ? 0 :  ovl[ii].a_hang);
      uint32   end = fLen - ((ovl[ii].b_hang >= 0) ? 0 : -ovl[ii].b_hang);

      assert(bgn <= end);
      assert(end <= fLen);

      //  I debated a bit if (bgn == 0) was correct, or if the hang needed to
      //  be non-zero, for this to be called a 'dovetail overhang'.  The
      //  isContained flag should be filtering out reads that are contained
      //  in us:
      //      -------------------- (us)
      //      -----------          (them)
      //  so the other case is that (them) must be bigger than (us), e.g.,
      //  we're a contained read anyway and so don't reallty care about the
      //  distinction between an end covered exactly by another read vs an
      //  end covered by another read that extends us.
      //
      //  So it's sufficient to just test (end == 0 && not-contained) to
      //  decide if this read is covered by a read that will allow our path
      //  to get out of this read.

      if ((bgn == 0)    && (isContained(ovl[ii].b_iid) == false))   dove5 = true;
      if ((end == fLen) && (isContained(ovl[ii].b_iid) == false))   dove3 = true;

      IL.add(bgn, end - bgn);
    }

    //  Squish overlapping invervals together.

    IL.merge(covGapOlap);

    //  If no intervals at all, it's a singleton, and we don't care much what
    //  happens.  The logic is a bit easier if we just bail on it though.

    if (IL.numberOfIntervals() == 0)
      continue;

    //  Test if those intervals indicate a read we want to ignore.

    if ((ct == covgapChimer) &&             //  It's a simple chimer if there
        (IL.numberOfIntervals() > 1))       //  is more than one interval.
      isCovGap = true;

    if ((ct == covgapUncovered) &&          //  It's an uncovered read if there is more
        ((IL.numberOfIntervals() > 1) ||    //  than one interval or (implicitly one
         (IL.lo(0) != 0) ||                 //  interval and) either end is not covered.
         (IL.hi(0) != fLen)))
      isCovGap = true;

    if ((ct == covgapDeadend) &&            //  It's an uncovered read with no path out if
        ((IL.numberOfIntervals() > 1) ||    //  more than one interval or (implicitly one
         (dove5 == false) ||                //  interval and) either end isn't covered by
         (dove3 == false)))                 //  a dovetail read.
      isCovGap = true;

    //  If a covGap log and flag it.

    if (isCovGap == true) {
      if (F)
        logCovGapRead(F, fi, IL, dove5, dove3);

      CG[fi] = 1;
    }
  }

  merylutil::closeFile(F, prefix, '.', "best.coverageGap");

  //  Finally, set all the coverage gap marks.  This is done last to prevent
  //  a race when isOverlapBadQuality() ignores a coverage gap read.

  for (uint32 fi=1; fi <= fiLimit; fi++)
    if (CG[fi])
      setCoverageGap(fi);   //  Now it's a coverage gap read.

  delete [] CG;
}



//
//
//
//  Compare the size of our edges (this5 and this3) against the edges back
//  from the reads those point to.
//
//  ------------------------->
//                         |
//                       this3
//                         v
//            <--back3-- -------------------->
//
//  If back3 points back to us, our 3' end is good.
//  If back3 is of a comparable size to this3, our 3' end is good.
//
//
//



void
BestOverlapGraph::removeLopsidedEdges(const char *prefix, const char *label, double lopsidedDiff) {
  uint32  fiLimit    = RI->numReads();
  uint32  numThreads = getNumThreads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  FILE  *LOP = merylutil::openOutputFile(prefix, '.', label, logFileFlagSet(LOG_BEST_EDGES));

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fi=1; fi <= fiLimit; fi++) {

    if ((isIgnored(fi)     == true) ||   //  Skip ignored and contain, because we don't care
        (isContained(fi)   == true) ||   //  about best edges to/from these.  And skip covGap
        (isCoverageGap(fi) == true))     //  because they're already flagged as crap.
      continue;

    if (isLopsided(fi) == true)        //  In the second pass, skip lopsided.  They're already
      continue;                        //  flagged, and we won't undo it.

    if (isSpur(fi) == true)            //  In the second pass, skip spurs.  We don't care
      continue;                        //  if they're lopsided.

    BestEdgeOverlap *this5 = getBestEdgeOverlap(fi, false);
    BestEdgeOverlap *this3 = getBestEdgeOverlap(fi, true);

    //  Find the length of the overlap on either end.

    int32   this5ovlLen = RI->overlapLength(fi, this5->readId(), this5->ahang(), this5->bhang());
    int32   this3ovlLen = RI->overlapLength(fi, this3->readId(), this3->ahang(), this3->bhang());

    //  Find the edges from our best edges.  Note that 'back5' is NOT the
    //  edge out of the 5' end of that read; it is the edge that should point
    //  back to our 5' end.

    BestEdgeOverlap *back5 = getBestEdgeOverlap(this5->readId(), this5->read3p());
    BestEdgeOverlap *back3 = getBestEdgeOverlap(this3->readId(), this3->read3p());

    //  If both point back to us, we're done.  These must be symmetric, else
    //  overlapper is bonkers.

    if ((back5->readId() == fi) && (back5->read3p() == false) &&
        (back3->readId() == fi) && (back3->read3p() == true))
      continue;

    //  Complain loudly if we have a best overlap to a spur.  Why doesn't the
    //  read we have an edge to have a edge out of it?!
    //
    //  Unfortunately, this fails on the reduced graph.

#if 1
    if ((this5->isValid() == true) && (back5->isUnset() == true))
      writeStatus("WARNING: read %u 5' has overlap to spur read %u %c'!\n",
               fi, this5->readId(), this5->read3p() ? '3' : '5');

    if ((this3->isValid() == true) && (back3->isUnset() == true))
      writeStatus("WARNING: read %u 3' has overlap to spur read %u %c'!\n",
               fi, this3->readId(), this3->read3p() ? '3' : '5');

    assert((this5->isUnset() == true) || (back5->isValid() == true));
    assert((this3->isUnset() == true) || (back3->isValid() == true));
#endif

    //  Compute the length of those best overlaps...

    int32  back5ovlLen = RI->overlapLength(this5->readId(), back5->readId(), back5->ahang(), back5->bhang());
    int32  back3ovlLen = RI->overlapLength(this3->readId(), back3->readId(), back3->ahang(), back3->bhang());

    //  ...and compare.  The read will be lopsided if the edge out of us is
    //  less than 25% of the length of the comparable edge out of our
    //  neighbor.

    double  score5 = (back5ovlLen == 0) ? (100.0) : (100.0 * this5ovlLen / back5ovlLen);
    double  score3 = (back3ovlLen == 0) ? (100.0) : (100.0 * this3ovlLen / back3ovlLen);

    //  Not lopsided if the overlap is thick relative to the length of this read.
#if 0
    if (this5ovlLen >= 0.333 * RI->readLength(fi))   score5 = 100.0;
    if (this3ovlLen >= 0.333 * RI->readLength(fi))   score3 = 100.0;
#endif

    //  Declare lopsided if the score is low.

    setLopsided5(fi, (score5 < lopsidedDiff));
    setLopsided3(fi, (score3 < lopsidedDiff));

    if (LOP) {
      char  loplog[1024] = {0};

      if (isLopsided(fi) == true) {
        if     ((this5->readId() > 0) &&
                (this3->readId() > 0))
          sprintf(loplog, "lopsidedBest %8u -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%% -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%%\n",
                  fi,
                  this5->readId(), this5->read3p() ? '3' : '5', this5ovlLen, back5->readId(), back5->read3p() ? '3' : '5', back5ovlLen, score5,
                  this3->readId(), this3->read3p() ? '3' : '5', this3ovlLen, back3->readId(), back3->read3p() ? '3' : '5', back3ovlLen, score3);

        else if (this5->readId() > 0)
          sprintf(loplog, "lopsidedBest %8u -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%% --\n",
                  fi,
                  this5->readId(), this5->read3p() ? '3' : '5', this5ovlLen, back5->readId(), back5->read3p() ? '3' : '5', back5ovlLen, score5);

        else if (this3->readId() > 0)
          sprintf(loplog, "lopsidedBest %8u --                                                            -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%%\n",
                  fi,
                  this3->readId(), this3->read3p() ? '3' : '5', this3ovlLen, back3->readId(), back3->read3p() ? '3' : '5', back3ovlLen, score3);
      }

      //  Waaaaay too much logging to report all the non-lopsided reads.  Unless you really care.
      //else
      //  sprintf(loplog, "fi %8u -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%% -- %8u/%c' len %6u VS %8u/%c' len %6u %8.4f%% -- ACCEPTED\n",
      //          fi,
      //          this5->readId(), this5->read3p() ? '3' : '5', this5ovlLen, back5->readId(), back5->read3p() ? '3' : '5', back5ovlLen, score5,
      //          this3->readId(), this3->read3p() ? '3' : '5', this3ovlLen, back3->readId(), back3->read3p() ? '3' : '5', back3ovlLen, score3);

      if (loplog[0] != 0)
#pragma omp critical (fprintf_LOP)
        fputs(loplog, LOP);
    }
  }

  merylutil::closeFile(LOP);

  //  Remove overlaps to or from lopsided reads.  We'll let the parent find
  //  new edges, as needed.

  for (uint32 fi=1; fi <= fiLimit; fi++) {
    BestEdgeOverlap *this5 = getBestEdgeOverlap(fi, false);
    BestEdgeOverlap *this3 = getBestEdgeOverlap(fi, true);

    if ((isLopsided(fi) == true) || (isLopsided(this5->readId()) == true))
      *this5 = BestEdgeOverlap();

    if ((isLopsided(fi) == true) || (isLopsided(this3->readId()) == true))
      *this3 = BestEdgeOverlap();
  }
}



//
//
//
//  Find reads that terminate at a spur after some short traversal.
//
//    A read is marked SPUR if there is no edge out of it on either end.
//
//    A read is marked SPURPATH if the path out of the read on that end
//    terminates (after some distance) at a spur read.
//
//  Note that SPUR reads have at least one SPURPATH end flagged.
//
//
//



uint32
BestOverlapGraph::spurDistance(BestEdgeOverlap *edge, uint32 limit, uint32 distance) {

  //  Walk the chain of best edges starting with 'edge'.  Each step enters a
  //  read on one end and leaves on the best edge out of its other end,
  //  adding one to 'distance'.
  //
  //  Returns the distance at which an edge with no target read was found
  //  (the path reached a spur), or UINT32_MAX if 'distance' reached
  //  'limit' first.

  for (;;) {
    uint32  target  = edge->readId();
    bool    leave3p = (edge->read3p() == false);   //  Leave by the end we did not enter.

    assert(isIgnored(target)     == false);   //  Edge to ignored read, ERROR!
    assert(isCoverageGap(target) == false);   //  Edge to covgap read, ERROR!

    if (target == 0)            //  No edge; the path terminated early,
      return(distance);         //  so we've followed a path to a spur.

    if (distance == limit)      //  Hit the limit, so not a spur;
      return(UINT32_MAX);       //  return infinity.

    edge = getBestEdgeOverlap(target, leave3p);
    distance++;
  }
}



void
BestOverlapGraph::removeSpannedSpurs(const char *prefix, uint32 spurDepth) {
  uint32  fiLimit    = RI->numReads();
  uint32  numThreads = getNumThreads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  std::set<uint32>  spurpath5;   //  spurpath is true if the edge out of this end
  std::set<uint32>  spurpath3;   //           leads to a dead-end spur.
  std::set<uint32>  spur;        //  spur     is true if this read is a spur.

  //  Compute the distance to a dead end.
  //    If zero, flag the read as a spur.
  //    If small, flag that read end as leading to a spur end.

  for (uint32 fi=1; fi <= fiLimit; fi++) {
    if ((RI->isValid(fi)   == false) ||   //  Unused read, ignore.
        (isIgnored(fi)     == true)  ||   //  Ignored read, ignore.
        (isContained(fi)   == true)  ||   //  Contained read, ignore.
        (isCoverageGap(fi) == true)  ||   //  Chimeric covGap read, ignore.
        (isLopsided(fi)    == true))      //  Lopsided read, ignore.
      continue;

    uint32  dist5 = spurDistance(getBestEdgeOverlap(fi, false), spurDepth);
    uint32  dist3 = spurDistance(getBestEdgeOverlap(fi,  true), spurDepth);

#if 0
    if (dist5 == 0)           {  writeLog("read %u 5' is a terminal spur\n", fi);  spur.insert(fi);       }
    if (dist3 == 0)           {  writeLog("read %u 3' is a termainl spur\n", fi);  spur.insert(fi);       }
    if (dist5  < spurDepth)   {  writeLog("read %u 5' is a spur path\n", fi);      spurpath5.insert(fi);  }
    if (dist3  < spurDepth)   {  writeLog("read %u 3' is a spur path\n", fi);      spurpath3.insert(fi);  }
#endif

    if (dist5 == 0)           spur.insert(fi);
    if (dist3 == 0)           spur.insert(fi);
    if (dist5  < spurDepth)   spurpath5.insert(fi);
    if (dist3  < spurDepth)   spurpath3.insert(fi);
  }

  //
  //  Do several iterations of spur path mark removal.
  //

  writeStatus("BestOverlapGraph()--   Initial       %8u terminal  spur reads - %8u/%-8u 5'/3' spur path reads.\n",
              spur.size(), spurpath5.size(), spurpath3.size());

  uint32  n5pChanged = 1;
  uint32  n3pChanged = 1;

  for (uint32 iter=1; ((n5pChanged + n3pChanged > 0) && (iter <= 2 * spurDepth)); iter++) {
    n5pChanged = 0;
    n3pChanged = 0;

    //
    //  For any read that can get out on a non-spur-path end, mark the
    //  incoming edge as not a spur path:
    //
    //    If the 5' path out of me is not a spur path, then the
    //    edge INTO my 3' end is not a spur path.
    //

    for (uint32 fi=1; fi <= fiLimit; fi++) {
      if ((RI->isValid(fi)   == false) ||   //  Unused read, ignore.
          (isIgnored(fi)     == true)  ||   //  Ignored read, ignore.
          (isContained(fi)   == true)  ||   //  Contained read, ignore.
          (isCoverageGap(fi) == true)  ||   //  Chimeric covGap read, ignore.
          (isLopsided(fi)    == true))      //  Lopsided read, ignore.
        continue;

      BestEdgeOverlap  *edge5 = getBestEdgeOverlap(fi, false);
      BestEdgeOverlap  *edge3 = getBestEdgeOverlap(fi,  true);

      bool              sp5  = (spurpath5.count(fi) > 0);
      bool              sp3  = (spurpath3.count(fi) > 0);

      bool              sp53 = (spurpath5.count(edge3->readId()) > 0);
      bool              sp33 = (spurpath3.count(edge3->readId()) > 0);
      bool              sp55 = (spurpath5.count(edge5->readId()) > 0);
      bool              sp35 = (spurpath3.count(edge5->readId()) > 0);

      //  Logging, only if enabled, and only if the spur path is actually removed.

#if 0
      if ((sp5 == false) && (edge3->read3p() == false) && (sp53 == true))   writeLog("SPUR path from read %u 5' removed\n", edge3->readId());
      if ((sp5 == false) && (edge3->read3p() ==  true) && (sp33 == true))   writeLog("SPUR path from read %u 3' removed\n", edge3->readId());
      if ((sp3 == false) && (edge5->read3p() == false) && (sp55 == true))   writeLog("SPUR path from read %u 5' removed\n", edge5->readId());
      if ((sp3 == false) && (edge5->read3p() ==  true) && (sp35 == true))   writeLog("SPUR path from read %u 3' removed\n", edge5->readId());
#endif

      //  Remove spur-path marks.

      if ((sp5 == false) && (edge3->read3p() == false) && (sp53 == true))   spurpath5.erase(edge3->readId());
      if ((sp5 == false) && (edge3->read3p() ==  true) && (sp33 == true))   spurpath3.erase(edge3->readId());
      if ((sp3 == false) && (edge5->read3p() == false) && (sp55 == true))   spurpath5.erase(edge5->readId());
      if ((sp3 == false) && (edge5->read3p() ==  true) && (sp35 == true))   spurpath3.erase(edge5->readId());
    }

    //
    //  Compute new edges that don't point to a spur or spurpath, unless
    //  those are the only edges it has.
    //
    //  1) Over all overlaps for this read,
    //     ignore crappy edges, and edges to
    //     contains, etc.                        --------->
    //                                                  |
    //  2) Score the edge if the exit from the          v         (score this edge if
    //     read it points to is not a spur path.      -------->    this end isn't a spur-path)
    //
    //     Note that spur reads are also spur-path reads.
    //
    //
    //  3) After all overlaps are processed, if
    //     a read still has no best edge, restore
    //     the previous best edge -- this will be
    //     an edge to a spur or spur-path read, but
    //     it's the ONLY path we have.
    //

    memset(_best5score, 0, sizeof(uint64) * (fiLimit + 1));     //  Clear all edge scores.
    memset(_best3score, 0, sizeof(uint64) * (fiLimit + 1));     //  Clear all edge scores.

    for (uint32 fi=1; fi <= fiLimit; fi++) {
      if ((RI->isValid(fi)   == false) ||   //  Unused read, ignore.
          (isIgnored(fi)     == true)  ||   //  Ignored read, ignore.
          (isContained(fi)   == true)  ||   //  Contained read, ignore.
          (isCoverageGap(fi) == true))      //  Chimeric covGap read, ignore.
        continue;

      uint32      no  = 0;
      BAToverlap *ovl = OC->getOverlaps(fi, no);

      BestEdgeOverlap  prev5 = *getBestEdgeOverlap(fi, false);    //  Remember the previous best edges.
      BestEdgeOverlap  prev3 = *getBestEdgeOverlap(fi,  true);

      getBestEdgeOverlap(fi, false)->clear();                     //  Then clear them.
      getBestEdgeOverlap(fi,  true)->clear();

      for (uint32 ii=0; ii<no; ii++) {                            //  Over all overlaps for this read,
        if ((ovl[ii].isDovetail()         == false) ||            //    Ignore non-dovetail and crappy overlaps.
            (isOverlapBadQuality(ovl[ii]) == true))               //    They can't form best edges.
          continue;

        if ((isIgnored(ovl[ii].b_iid)     == true) ||             //    Ignore overlaps to ignored reads.
            (isContained(ovl[ii].b_iid)   == true) ||             //    Ignore overlaps to contained and chimeric reads.
            (RI->isValid(fi)              == false))              //    Ignore overlaps to reads that don't exist.
          continue;

        if (isCoverageGap(ovl[ii].b_iid)  == true)                //    No edges to coverage gap reads are allowed.
          continue;

        if ((isLopsided(ovl[ii].a_iid)   == true) ||              //    No edges to or from lopsided reads.
            (isLopsided(ovl[ii].b_iid)   == true)) {
          assert(0);
          continue;
        }

        //if (isLopsided(ovl[ii].b_iid)     == true)              //    Allow edges to bubble reads.
        //  continue;

        bool   Aend5 = ovl[ii].AEndIs5prime();   //  The overlap must extend off either the 5' or 3' end for it to be
        bool   Aend3 = ovl[ii].AEndIs3prime();   //  a valid BestEdge.  We're not contained, so only one can be true

        if (Aend5 == true)   assert(Aend3 == false);
        if (Aend3 == true)   assert(Aend5 == false);

        bool   Bend5 = ovl[ii].BEndIs5prime();   //  The overlap can go into either end of the B read.
        bool   Bend3 = ovl[ii].BEndIs3prime();

        bool   sp5c  = (spurpath5.count(ovl[ii].b_iid) > 0);
        bool   sp3c  = (spurpath3.count(ovl[ii].b_iid) > 0);

        //  Log the edges we are skipping, if enabled.  This isn't as useful
        //  as you'd think, since it catches EVERY edge in the path to a
        //  spur, not just the best.

#if 0
        if ((Aend5 == true) && (Bend5 ==  true) && (sp3c == true))   writeLog("edge from %u 5' to %u 5' ignored; outgoing 3' edge leads to a spur\n", ovl[ii].a_iid, ovl[ii].b_iid);
        if ((Aend5 == true) && (Bend5 == false) && (sp5c == true))   writeLog("edge from %u 5' to %u 3' ignored; outgoing 5' edge leads to a spur\n", ovl[ii].a_iid, ovl[ii].b_iid);

        if ((Aend3 == true) && (Bend5 ==  true) && (sp3c == true))   writeLog("edge from %u 3' to %u 5' ignored; outgoing 3' edge leads to a spur\n", ovl[ii].a_iid, ovl[ii].b_iid);
        if ((Aend3 == true) && (Bend5 == false) && (sp5c == true))   writeLog("edge from %u 3' to %u 3' ignored; outgoing 5' edge leads to a spur\n", ovl[ii].a_iid, ovl[ii].b_iid);
#endif

        //  Score the edge.

        if ((Aend5 == true) && (Bend5 ==  true) && (sp3c == false))   scoreEdge(ovl[ii], true, true);
        if ((Aend5 == true) && (Bend5 == false) && (sp5c == false))   scoreEdge(ovl[ii], true, true);

        if ((Aend3 == true) && (Bend5 ==  true) && (sp3c == false))   scoreEdge(ovl[ii], true, true);
        if ((Aend3 == true) && (Bend5 == false) && (sp5c == false))   scoreEdge(ovl[ii], true, true);
      }

      //  All edges scored.  If the new edge is significantly worse than the
      //  spur edge, reset to the spur edge.  But there was no clear signal
      //  here to filter on and the attempt was abandoned.
      //
      //  This was added in af5a174204b37a65ca6e9281cc3e116424c62185.

      //  All edges scored.  If no best edge found, log and reset to whatever
      //  previous edge was there.

#if 0
      if (getBestEdgeOverlap(fi, false)->isUnset() == true)   writeLog("restore edge out of %u 5' to previous best %u %c'\n", fi, prev5.readId(), prev5.read3p() ? '3' : '5');
      if (getBestEdgeOverlap(fi,  true)->isUnset() == true)   writeLog("restore edge out of %u 3' to previous best %u %c'\n", fi, prev3.readId(), prev3.read3p() ? '3' : '5');
#endif

      if (getBestEdgeOverlap(fi, false)->isUnset() == true)   *getBestEdgeOverlap(fi, false) = prev5;
      if (getBestEdgeOverlap(fi,  true)->isUnset() == true)   *getBestEdgeOverlap(fi,  true) = prev3;

      //  Count the number of edges we switched from following a spur path to a normal path.
      //  These are just the edges that are different than their previous version.

      if (*getBestEdgeOverlap(fi, false) != prev5)  n5pChanged++;
      if (*getBestEdgeOverlap(fi,  true) != prev3)  n3pChanged++;
    }

    //
    //  Done with an iteration.  Report a bit of status.
    //

    writeStatus("BestOverlapGraph()--   Iteration %1u - %8u terminal  spur reads - %8u/%-8u 5'/3' spur path reads - %8u/%-8u edges changed to avoid spur path.\n",
                iter, spur.size(), spurpath5.size(), spurpath3.size(), n5pChanged, n3pChanged);
  }

  //
  //  Set the spur flag for any spur reads, and log the result.
  //

  FILE   *F = merylutil::openOutputFile(prefix, '.', "best.spurs", logFileFlagSet(LOG_BEST_EDGES));

  for (uint32 fi=1; fi <= fiLimit; fi++) {
    bool s5 = (spurpath5.count(fi) > 0);
    bool s3 = (spurpath3.count(fi) > 0);

    if ((s5 == false) && (s3 == false))  //  No spur mark, so not a spur.
      continue;

    //  Set the spur flag for this read.

    setSpur(fi);

    //  Log it.  It's 'terminal' if there is no edge out of that end.

    bool t5 = (getBestEdgeOverlap(fi, false)->isUnset() == true);
    bool t3 = (getBestEdgeOverlap(fi,  true)->isUnset() == true);

    if ((s5 == true) && (F))
      fprintf(F, "%u 5' is a %s spur end\n", fi, (t5) ? "terminal" : "non-terminal");

    if ((s3 == true) && (F))
      fprintf(F, "%u 3' is a %s spur end\n", fi, (t3) ? "terminal" : "non-terminal");
  }

  merylutil::closeFile(F);

  writeStatus("BestOverlapGraph()--   Final         %8u confirmed spur reads - %8u/%-8u 5'/3' spur path reads.\n",
              numSpur(), spurpath5.size(), spurpath3.size());

  //
  //  Remove edges from spur reads to good reads.
  //  This prevents spurs from breaking contigs.
  //

  for (uint32 fi=1; fi <= fiLimit; fi++) {
    bool ss = (spur.count(fi) > 0);
    bool s5 = (spurpath5.count(fi) > 0);
    bool s3 = (spurpath3.count(fi) > 0);

    if ((ss == false) &&   //  Read fi isn't a spur, and both
        (s5 == false) &&   //  ends aren't leading to a spur;
        (s3 == false))     //  keep all edges intact.
      continue;

    //  Read fi is either a spur or a spur-path read.

    //  Remove edges from this read to any non-spur read, unless that
    //  non-spur read itself has an edge back to us.  If it does, then, by
    //  construction, this is the only place that read can go, and so our
    //  edge to it is valid.

    //  If our 5' edge points to a non-spur read, delete the edge unless it
    //  points back to us.
    {
      BestEdgeOverlap  *edge5 = getBestEdgeOverlap(fi, false);
      uint32            read5 = edge5->readId();
      bool              spur5 = ((spur.count(read5) + spurpath5.count(read5) + spurpath3.count(read5)) > 0);

      if (spur5 == false) {
        BestEdgeOverlap  *backedge = getBestEdgeOverlap(read5, edge5->read3p());

        if ((backedge->readId() != fi) ||
            (backedge->read3p() != false)) {
          //if (read5 != 0)
          //  writeLog("DELETE edge from spur %u 5' to non-spur %u %c'\n", fi, read5, edge5->read3p() ? '3' : '5');
          edge5->clear();
        }
      }
    }

    //  If our 3' edge points to a non-spur read, delete the edge unless it
    //  points back to us.
    {
      BestEdgeOverlap  *edge3 = getBestEdgeOverlap(fi, true);
      uint32            read3 = edge3->readId();
      bool              spur3 = ((spur.count(read3) + spurpath5.count(read3) + spurpath3.count(read3)) > 0);

      if (spur3 == false) {
        BestEdgeOverlap  *backedge = getBestEdgeOverlap(read3, edge3->read3p());

        if ((backedge->readId() != fi) ||
            (backedge->read3p() != true)) {
          //if (read3 != 0)
          //  writeLog("DELETE edge from spur %u 3' to non-spur %u %c'\n", fi, read3, edge3->read3p() ? '3' : '5');
          edge3->clear();
        }
      }
    }
  }
}



//
//
//



bool
BestOverlapGraph::isOverlapBadQuality(BAToverlap& olap) const {

  //  Decide if an overlap should be filtered out.  The verdict is stored in
  //  olap.filtered and also returned.
  //
  //  Each flag below is one independent reason to reject the overlap.  All
  //  of them are evaluated (there is no early exit) so that the log line at
  //  the end can show every reason that applied.

  //  The error rate is above the current limit.
  bool const  highError  = (olap.erate() > _errorLimit);

  //  One of the reads is not valid.  This should never occur.
  bool const  notValid   = ((RI->isValid(olap.a_iid) == false) ||
                            (RI->isValid(olap.b_iid) == false));

  //  One of the reads is flagged as ignored.
  bool const  ignoredRd  = ((isIgnored(olap.a_iid) == true) ||
                            (isIgnored(olap.b_iid) == true));

  //  One of the reads is flagged as lopsided.
  bool const  lopsidedRd = ((isLopsided(olap.a_iid) == true) ||
                            (isLopsided(olap.b_iid) == true));

  //  One of the reads is flagged as a coverage gap read.
  bool const  covGapRd   = ((isCoverageGap(olap.a_iid) == true) ||
                            (isCoverageGap(olap.b_iid) == true));

  //  One of the reads is flagged as a spur.
  bool const  spurRd     = ((isSpur(olap.a_iid) == true) ||
                            (isSpur(olap.b_iid) == true));

  //  The overlap is short relative to either read.  Only tested when a
  //  minimum overlap fraction has been set.
  bool        tooShort   = false;

  if (_minOlapPercent > 0.0) {
    uint32 const  lenA = RI->readLength(olap.a_iid);
    uint32 const  lenB = RI->readLength(olap.b_iid);
    uint32 const  oLen = RI->overlapLength(olap.a_iid, olap.b_iid, olap.a_hang, olap.b_hang);

    bool const    shortForA = (oLen < lenA * _minOlapPercent);
    bool const    shortForB = (oLen < lenB * _minOlapPercent);

    tooShort = (shortForA || shortForB);
  }

  //  Any single reason is enough to filter the overlap.

  bool const  rejected = (highError  ||
                          notValid   ||
                          ignoredRd  ||
                          lopsidedRd ||
                          covGapRd   ||
                          spurRd     ||
                          tooShort);

  olap.filtered = rejected;

  //  The rest is logging.  As written, the read test accepts every non-zero
  //  read ID; edit the ID list to restrict logging to specific reads.

  if (logFileFlagSet(LOG_OVERLAP_SCORING)) {
    bool const  logThisRead = ((olap.a_iid != 0) ||
                               (olap.a_iid == 8675309) ||
                               (olap.a_iid == 12345));

    if (logThisRead == true)
      writeLog("isOverlapBadQuality()-- %6d %6d %c  hangs %6d %6d err %.5f -- %c%c%c%c%c%c%c\n",
               olap.a_iid, olap.b_iid,
               olap.flipped ? 'A' : 'N',
               olap.a_hang,
               olap.b_hang,
               olap.erate(),
               highError  ? 't' : 'f',
               notValid   ? 't' : 'f',
               ignoredRd  ? 't' : 'f',
               lopsidedRd ? 't' : 'f',
               covGapRd   ? 't' : 'f',
               spurRd     ? 't' : 'f',
               tooShort   ? 't' : 'f');
  }

  return(rejected);
}



inline
void
logEdgeScore(BAToverlap   &olap,
             const char   *message) {

  //  Write one scoreEdge() log line describing the overlap and the decision
  //  ('message') made about it.  Does nothing unless LOG_OVERLAP_SCORING is
  //  enabled.

  if (!logFileFlagSet(LOG_OVERLAP_SCORING))
    return;

  //  As written, every non-zero read ID passes this test; edit the ID list
  //  to restrict logging to specific reads.

  bool const  logThisRead = ((olap.a_iid != 0) ||
                             (olap.a_iid == 97202) ||
                             (olap.a_iid == 30701));

  if (logThisRead == false)
    return;

  writeLog("scoreEdge()-- %6d %c' to %6d %c' -- hangs %6d %6d err %8.6f -- %s\n",
           olap.a_iid, olap.AEndIs3prime() ? '3' : '5',
           olap.b_iid, olap.BEndIs3prime() ? '3' : '5',
           olap.a_hang,
           olap.b_hang,
           olap.erate(),
           message);
}



void
BestOverlapGraph::scoreEdge(BAToverlap& olap, bool c5, bool c3) {

  //  Score one overlap and, if it beats the best score seen so far for the
  //  A read end it leaves from, remember it as that end's best edge.
  //
  //  c5 / c3 say whether the 5' / 3' end of the A read is being scored at
  //  all; overlaps off an end that is not being scored leave no trace in
  //  the best edge or score.

  //  The caller must never send an ignored or contained A read.

  assert(isIgnored(olap.a_iid)   == false);
  assert(isContained(olap.a_iid) == false);

  //  Silently drop overlaps that can never be a best edge: anything that
  //  isn't a dovetail, and anything into a contained or coverage gap read.
  //  Edges into lopsided reads are deliberately NOT dropped here.

  if (olap.isDovetail() == false)
    return;

  if (isContained(olap.b_iid) == true)
    return;

  if (isCoverageGap(olap.b_iid) == true)
    return;

  //  Drop, with a log entry, overlaps into ignored reads and overlaps
  //  that fail the quality test.

  if (isIgnored(olap.b_iid) == true) {
    logEdgeScore(olap, "ignored");
    return;
  }

  if (isOverlapBadQuality(olap) == true) {
    logEdgeScore(olap, "bad quality");
    return;
  }

  //  Score it, and find the edge and score slot for the A read end
  //  this overlap leaves from.

  uint64 const      candidate  = scoreOverlap(olap);
  bool const        from3p     = olap.AEndIs3prime();
  BestEdgeOverlap  *bestEdge   = getBestEdgeOverlap(olap.a_iid, from3p);
  uint64           *endScores  = (from3p) ? _best3score : _best5score;
  uint64           &bestScore  = endScores[olap.a_iid];

  assert(candidate > 0);

  //  Nothing more to do if this end of the read isn't being scored.

  bool const  scoreThisEnd = (from3p) ? c3 : c5;

  if (scoreThisEnd == false)
    return;

  //  Keep the overlap only if it is strictly better than what we have.
  //  Ties stay with the earlier overlap.

  if (candidate > bestScore) {
    logEdgeScore(olap, "BEST");
    bestEdge->set(olap);
    bestScore = candidate;
  }

  else {
    logEdgeScore(olap, "worse");
  }
}



void
BestOverlapGraph::findContains(void) {
  uint32  fiLimit    = RI->numReads();
  uint32  numThreads = getNumThreads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  //  Remove containment flags.

  for (uint32 fi=1; fi <= fiLimit; fi++)
    setContained(fi, false);

  //  Check all overlaps and flag any reads that are in a containment
  //  relationship.

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fi=1; fi <= fiLimit; fi++) {
    uint32      no  = 0;
    BAToverlap *ovl = OC->getOverlaps(fi, no);

    if ((isIgnored(fi)     == true) ||
        (isCoverageGap(fi) == true))
      continue;

    for (uint32 ii=0; ii<no; ii++) {
      if (isOverlapBadQuality(ovl[ii]))      //  Ignore crappy overlaps.
        continue;

      if ((ovl[ii].a_hang == 0) &&           //  If an exact overlap, make
          (ovl[ii].b_hang == 0) &&           //  the lower ID be contained.
          (ovl[ii].a_iid > ovl[ii].b_iid))   //  (Ignore if exact and this
        continue;                            //   ID is larger.)

      if ((ovl[ii].a_hang > 0) ||            //  Ignore if A is not
          (ovl[ii].b_hang < 0))              //  contained in B.
        continue;

      setContained(ovl[ii].a_iid);
    }
  }
}



void
BestOverlapGraph::findEdges(bool redoAll) {
  uint32  fiLimit    = RI->numReads();
  uint32  numThreads = getNumThreads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  //  Reset our scores.

  memset(_best5score, 0, sizeof(uint64) * (fiLimit + 1));
  memset(_best3score, 0, sizeof(uint64) * (fiLimit + 1));

  //  Clear all edges if we're recomputing all.

  if (redoAll == true) {
    for (uint32 fi=1; fi <= fiLimit; fi++) {
      _reads[fi]._best5.clear();
      _reads[fi]._best3.clear();
    }
  }

  //  For each read, score every overlap, remembering which is the best for each end.
  //  Only reads without existing best overlaps are computed, unless redoAll is set.

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fi=1; fi <= fiLimit; fi++) {
    if ((isIgnored(fi)     == true) ||         //  Ignore ignored reads.
        (isContained(fi)   == true) ||         //  Ignore contained reads.
        (isCoverageGap(fi) == true) ||         //  Ignore covGap reads; they're just garbage.
        (isSpur(fi)        == true) ||         //  Ignore spur reads; they have carefully set edges.
        (isLopsided(fi)    == true))           //  Ignore lopsided; they have no edges, period.
      continue;

    bool        c5 = _reads[fi]._best5.isUnset();   //  These change during scoreEdge(), and
    bool        c3 = _reads[fi]._best3.isUnset();   //  we need to remember the original value.

    uint32      no  = 0;
    BAToverlap *ovl = OC->getOverlaps(fi, no);

    for (uint32 ii=0; ii<no; ii++)                  //  Compute scores for all overlaps
      scoreEdge(ovl[ii], c5, c3);                   //  and remember the best.
  }
}



void
BestOverlapGraph::reportBestEdges(const char *prefix, const char *label) {
  char  N[FILENAME_MAX];

  //  Open output files.

  snprintf(N, FILENAME_MAX, "%s.%s.edges",     prefix, label);   FILE *BE = merylutil::openOutputFile(N);
  snprintf(N, FILENAME_MAX, "%s.%s.edges.gfa", prefix, label);   FILE *BG = merylutil::openOutputFile(N);

  //  Write best edges, flagging singleton, coverageGap, lopsided, etc.

  if (BE) {
    fprintf(BE, "readID   libID flags      5' edge M   length    eRate      5' edge M   length    eRate\n");
    fprintf(BE, "-------- ----- -----  ----------- - -------- --------  ----------- - -------- --------\n");

    for (uint32 id=1; id<RI->numReads() + 1; id++) {
      BestEdgeOverlap *e5 = getBestEdgeOverlap(id, false);
      BestEdgeOverlap *e3 = getBestEdgeOverlap(id, true);

      BestEdgeOverlap  *e5back = getBestEdgeOverlap(e5->readId(), e5->read3p());   //  Get edges that are
      BestEdgeOverlap  *e3back = getBestEdgeOverlap(e3->readId(), e3->read3p());   //  potentially back to us.

      bool  e5e = (e5->isUnset() == true) ? false : true;
      bool  e3e = (e3->isUnset() == true) ? false : true;

      double e5err = AS_OVS_decodeEvalue(e5->evalue());
      double e3err = AS_OVS_decodeEvalue(e3->evalue());

      uint32 e5len = RI->overlapLength(id, e5->readId(), e5->ahang(), e5->bhang());
      uint32 e3len = RI->overlapLength(id, e3->readId(), e3->ahang(), e3->bhang());

      char  e5mutual = ((e5back->readId() == id) && (e5back->read3p() == false)) ? 'M' : '-';
      char  e3mutual = ((e3back->readId() == id) && (e3back->read3p() ==  true)) ? 'M' : '-';

      if (RI->readLength(id) == 0)
        continue;

      if      ((e5e == false) && (e3e == false)) {
        fprintf(BE, "%-8u %5u %c%c%c%c%c  -------- -- - -------- --------  -------- -- - -------- --------\n",
                id,
                RI->libraryIID(id),
                (isContained(id))   ? 'C' : '-',
                (isIgnored(id))     ? 'I' : '-',
                (isCoverageGap(id)) ? 'G' : '-',
                (isLopsided(id))    ? 'L' : '-',
                (isSpur(id))        ? 'S' : '-');
      }

      else if ((e5e == false) && (e3e ==  true)) {
        fprintf(BE, "%-8u %5u %c%c%c%c%c  -------- -- - -------- --------  %8u %c' %c %8u %8.6f\n",
                id,
                RI->libraryIID(id),
                (isContained(id))   ? 'C' : '-',
                (isIgnored(id))     ? 'I' : '-',
                (isCoverageGap(id)) ? 'G' : '-',
                (isLopsided(id))    ? 'L' : '-',
                (isSpur(id))        ? 'S' : '-',
                e3->readId(), e3->read3p() ? '3' : '5', e3mutual, e3len, e3err);
      }

      else if ((e5e ==  true) && (e3e == false)) {
        fprintf(BE, "%-8u %5u %c%c%c%c%c  %8u %c' %c %8u %8.6f  -------- -- - -------- --------\n",
                id,
                RI->libraryIID(id),
                (isContained(id))   ? 'C' : '-',
                (isIgnored(id))     ? 'I' : '-',
                (isCoverageGap(id)) ? 'G' : '-',
                (isLopsided(id))    ? 'L' : '-',
                (isSpur(id))        ? 'S' : '-',
                e5->readId(), e5->read3p() ? '3' : '5', e5mutual, e5len, e5err);
      }

      else if ((e5e ==  true) && (e3e ==  true)) {
        fprintf(BE, "%-8u %5u %c%c%c%c%c  %8u %c' %c %8u %8.6f  %8u %c' %c %8u %8.6f\n",
                id,
                RI->libraryIID(id),
                (isContained(id))   ? 'C' : '-',
                (isIgnored(id))     ? 'I' : '-',
                (isCoverageGap(id)) ? 'G' : '-',
                (isLopsided(id))    ? 'L' : '-',
                (isSpur(id))        ? 'S' : '-',
                e5->readId(), e5->read3p() ? '3' : '5', e5mutual, e5len, e5err,
                e3->readId(), e3->read3p() ? '3' : '5', e3mutual, e3len, e3err);
      }

      else {
        assert(0);
      }
    }
  }

  //  Write best edge graph.

  if (BG) {
    fprintf(BG, "H\tVN:Z:1.0\n");

    //  First, write the sequences used.  The sequence can be used as either
    //  a source node or a destination node (or both).

    std::set<uint32>  used;

    for (uint32 id=1; id<RI->numReads() + 1; id++) {
      BestEdgeOverlap *bestedge5 = getBestEdgeOverlap(id, false);
      BestEdgeOverlap *bestedge3 = getBestEdgeOverlap(id, true);

      if ((bestedge5->isUnset() == true) &&   //  Ignore singletons.
          (bestedge3->isUnset() == true) &&
          (isContained(id) == false))
        continue;

      if (isContained(id) == true)        //  Ignore contained reads.
        continue;

      //  Remember the source and destination of this edge.
      used.insert(id);
      used.insert(bestedge5->readId());
      used.insert(bestedge3->readId());
    }

    for (auto it=used.begin(); it != used.end(); it++)
      if (*it != 0)
        fprintf(BG, "S\tread%08u\t*\tLN:i:%u\n", *it, RI->readLength(*it));

    //  Now, report edges.  GFA wants edges in exactly this format:
    //
    //       -------------
    //             -------------
    //
    //  with read orientation given by +/-.  Conveniently, this is what we've saved (for the edges).

    for (uint32 id=1; id<RI->numReads() + 1; id++) {
      BestEdgeOverlap *bestedge5 = getBestEdgeOverlap(id, false);
      BestEdgeOverlap *bestedge3 = getBestEdgeOverlap(id, true);

      if ((bestedge5->isUnset() == true) &&   //  Ignore singletons.
          (bestedge3->isUnset() == true) &&
          (isContained(id) == false))
        continue;

      if (isContained(id) == true)        //  Ignore contained reads.
        continue;

      if (bestedge5->isValid() == true) {
        int32  ahang   = bestedge5->ahang();
        int32  bhang   = bestedge5->bhang();
        int32  olaplen = RI->overlapLength(id, bestedge5->readId(), bestedge5->ahang(), bestedge5->bhang());

        if ((ahang > 0) || (bhang > 0))
          fprintf(stderr, "BAD 5' overlap from read %u to read %u %c': hangs %d %d\n",
                  id,
                  bestedge3->readId(),
                  bestedge3->read3p() ? '3' : '5',
                  bestedge3->ahang(), bestedge3->bhang());
        assert((ahang <= 0) && (bhang <= 0));  //  ALL 5' edges should be this.

        fprintf(BG, "L\tread%08u\t-\tread%08u\t%c\t%uM\n",
                id,
                bestedge5->readId(), bestedge5->read3p() ? '-' : '+',
                olaplen);
      }

      if (bestedge3->isValid() == true) {
        int32  ahang   = bestedge3->ahang();
        int32  bhang   = bestedge3->bhang();
        int32  olaplen = RI->overlapLength(id, bestedge3->readId(), bestedge3->ahang(), bestedge3->bhang());

        if ((ahang < 0) || (bhang < 0))
          fprintf(stderr, "BAD 3' overlap from read %u to read %u %c': hangs %d %d\n",
                  id,
                  bestedge3->readId(),
                  bestedge3->read3p() ? '3' : '5',
                  bestedge3->ahang(), bestedge3->bhang());
        assert((ahang >= 0) && (bhang >= 0));  //  ALL 3' edges should be this.

        fprintf(BG, "L\tread%08u\t+\tread%08u\t%c\t%uM\n",
                id,
                bestedge3->readId(), bestedge3->read3p() ? '-' : '+',
                RI->overlapLength(id, bestedge3->readId(), bestedge3->ahang(), bestedge3->bhang()));
      }
    }
  }

  //  Close all the files.

  merylutil::closeFile(BE);
  merylutil::closeFile(BG);
}



//  Classify every read by how many best edges it has, and how many of
//  those are mutual, then write the counts to 'report' under the heading
//  '<label> EDGES'.
//
//  Each read is counted in exactly one of: contained, singleton (no
//  edges), spur (exactly one edge) or 'both' (two edges).
//
void
BestOverlapGraph::reportEdgeStatistics(FILE *report, char const *label) {
  uint32  fiLimit      = RI->numReads();

  uint32  nContained   = 0;
  uint32  nSingleton   = 0;
  uint32  nSpur        = 0;
  uint32  nSpur1Mutual = 0;
  uint32  nBoth        = 0;
  uint32  nBoth1Mutual = 0;
  uint32  nBoth2Mutual = 0;

  for (uint32 fi=1; fi <= fiLimit; fi++) {
    BestEdgeOverlap *this5 = getBestEdgeOverlap(fi, false);
    BestEdgeOverlap *this3 = getBestEdgeOverlap(fi, true);

    //  Count contained reads

    if (isContained(fi)) {
      nContained++;
      continue;
    }

    //  Count singleton reads

    bool const  unset5 = (this5->isUnset() == true);
    bool const  unset3 = (this3->isUnset() == true);

    if ((unset5 == true) && (unset3 == true)) {
      nSingleton++;
      continue;
    }

    //  Compute mutual bestedness: an edge is mutual if the best edge out
    //  of the read end it points to comes back to this end of this read.

    bool  mutual5 = false;
    bool  mutual3 = false;

    if (this5->isValid() == true) {
      BestEdgeOverlap *that5 = getBestEdgeOverlap(this5->readId(), this5->read3p());

      mutual5 = ((that5->readId() == fi) && (that5->read3p() == false));
    }

    if (this3->isValid() == true) {
      BestEdgeOverlap *that3 = getBestEdgeOverlap(this3->readId(), this3->read3p());

      mutual3 = ((that3->readId() == fi) && (that3->read3p() == true));
    }

    //  Exactly one edge is missing: a spur, possibly with a mutual edge.

    if ((unset5 == true) || (unset3 == true)) {
      nSpur++;
      nSpur1Mutual += (mutual5 || mutual3) ? 1 : 0;
      continue;
    }

    //  Otherwise, both edges exist

    nBoth++;
    nBoth1Mutual +=  (mutual5 != mutual3) ? 1 : 0;
    nBoth2Mutual += ((mutual5 == true) && (mutual3 == true)) ? 1 : 0;
  }

  fprintf(report, "\n");
  fprintf(report, "%s EDGES\n", label);
  fprintf(report, "-------- ----------------------------------------\n");
  fprintf(report, "%8u reads are contained\n", nContained);
  fprintf(report, "%8u reads have no best edges (singleton)\n", nSingleton);
  fprintf(report, "%8u reads have only one best edge (spur) \n", nSpur);
  fprintf(report, "         %8u are mutual best\n", nSpur1Mutual);
  fprintf(report, "%8u reads have two best edges \n", nBoth);
  fprintf(report, "         %8u have one mutual best edge\n", nBoth1Mutual);
  fprintf(report, "         %8u have two mutual best edges\n", nBoth2Mutual);
  fprintf(report, "\n");
}



void
BestOverlapGraph::outputOverlaps(const char *prefix, const char *label, bool allOverlaps) {
  char            ovlName[FILENAME_MAX+1];

  snprintf(ovlName, FILENAME_MAX, "%s.%s.ovlStore", prefix, label);
  ovStoreWriter  *writer = new ovStoreWriter(ovlName, RI->seqStore());

  //  The overlap needs a pointer to the seqStore.  Normally this is set
  //  when the ovlStore is opened,

  ovOverlap       ovl;

  //  Iterate over all reads.  If an overlap is good quality, save it to the store.

  for (uint32 fi=1; fi <= RI->numReads(); fi++) {
    uint32      no   = 0;
    BAToverlap *ovls = OC->getOverlaps(fi, no);

    if (isIgnored(fi) == true)
      continue;

    for (uint32 ii=0; ii<no; ii++) {
      if ((allOverlaps == true) || (isOverlapBadQuality(ovls[ii]) == false)) {
        ovls[ii].convert(ovl);
        writer->writeOverlap(&ovl);
      }
    }
  }

  delete writer;
}



void
BestOverlapGraph::checkForContainedDovetails(void) const {
  uint32           fiLimit = RI->numReads();
  uint32           err5    = 0;
  uint32           err3    = 0;

  for (uint32 fi=1; fi <= fiLimit; fi++) {
    if (isContained(fi) == true) {
      if (getBestEdgeOverlap(fi, false)->isUnset() == false)   err5++;
      if (getBestEdgeOverlap(fi,  true)->isUnset() == false)   err3++;
    }
  }

  assert(err5 == 0);
  assert(err3 == 0);
}



void
BestOverlapGraph::checkForCovGapEdges(void) const {
  uint32           fiLimit = RI->numReads();
  uint32           err5    = 0;
  uint32           err3    = 0;

  for (uint32 fi=1; fi <= fiLimit; fi++) {
    if (isCoverageGap(fi) == true) {
      if (getBestEdgeOverlap(fi, false)->isUnset() == false)   err5++;
      if (getBestEdgeOverlap(fi,  true)->isUnset() == false)   err3++;
    }
  }

  assert(err5 == 0);
  assert(err3 == 0);
}



BestOverlapGraph::BestOverlapGraph(double            erateGraph,
                                   double            erateMax,
                                   double            erateForced,
                                   double            percentileError,
                                   double            deviationGraph,
                                   double            minOlapPercent,
                                   double            minReadsBest,
                                   const char       *prefix,
                                   covgapType        covGapType,        uint32  covGapOlap,
                                   bool              filterHighError,
                                   bool              filterLopsided,    double  lopsidedDiff,
                                   bool              filterSpur,        uint32  spurDepth,
                                   BestOverlapGraph *BOG) :
  _erateGraph(erateGraph),
  _erateMax(erateMax),
  _erateForced(erateForced),
  _percentileError(percentileError),
  _deviationGraph(deviationGraph),
  _minOlapPercent(minOlapPercent),
  _minReadsBest(minReadsBest) {

  FILE *report = merylutil::openOutputFile(prefix, '.', "best.report");

  writeStatus("\n");
  writeStatus("BestOverlapGraph()-- Computing Best Overlap Graph.\n");
  writeStatus("BestOverlapGraph()-- Allocating best edges (" F_SIZE_T "MB).\n",
           ((2 * sizeof(BestEdgeOverlap) * (RI->numReads() + 1)) >> 20));

  _reads               = new BestEdgeRead [RI->numReads() + 1];
  _errorLimit          = erateGraph;

  _best5score          = new uint64 [RI->numReads() + 1];   //  Cleared in findEdges().
  _best3score          = new uint64 [RI->numReads() + 1];

  //  If there is a BOG supplied, copy the reads from there to here.
  //
  //  This sets the status of orphans and bubbles.  Then we'll further flag
  //  those as ignored.
  //
  //  Edges will be recomputed on the first findEdges() call below, but we still need
  //  to clear all of them here.

  if (BOG) {
    memcpy(_reads, BOG->_reads, sizeof(BestEdgeRead) * (RI->numReads() + 1));

    for (uint32 fi=1; fi <= RI->numReads(); fi++) {
      _reads[fi]._best5.clear();
      _reads[fi]._best3.clear();

      if ((isOrphan(fi) == true) ||
          (isBubble(fi) == true)) {
        writeLog("IGNORE read %u %s %s\n", fi, isOrphan(fi) ? "orphan" : "", isBubble(fi) ? "bubble" : "");
        setIgnored(fi);
      }
    }
  }

  //
  //  Find initial edges.  erateGraph is tried first, but if 90% of
  //  (eligible) reads do not have two best edges, we'll fall back to using
  //  all overlaps we know about (erateMax).
  //
  //  With some kind of best edge on the reads, next analyze those edges to
  //  set a cutoff on overlap quality used for graph building.  Once we have
  //  the magic error rate limit, recompute best edges.
  //
  //  This should be done before removing coverageGap reads, so we can skip
  //  high-error overlaps (that would otherwise mask a problematic read).
  //  On one hifi set, there was no difference between doing it as here,
  //  and removing coverageGap reads first.
  //

  if (logFileFlagSet(LOG_BEST_OVERLAPS) || logFileFlagSet(LOG_SYMMETRIC_OVERLAPS))    //  Output all overlaps if
    outputOverlaps(prefix, "0.all", logFileFlagSet(LOG_SYMMETRIC_OVERLAPS));          //  LOG_SYMMETRIC_OVERLAPS

  findInitialEdges();                 //  Sets _errorLimit to either _erateGraph or _erateMax

  if (logFileFlagSet(LOG_BEST_EDGES))
    reportBestEdges(prefix, "0.initial");

  if (filterHighError) {
    writeStatus("BestOverlapGraph()-- Filtering high error edges.\n");

    findErrorRateThreshold(report);   //  Set the final threshold.
    findContains();                   //  Recompute contained reads; remove those contained in high-error parents.
    findEdges(true);                  //  Recompute best edges.

    if (logFileFlagSet(LOG_BEST_OVERLAPS))
      outputOverlaps(prefix, "1.filtered", false);

    writeStatus("BestOverlapGraph()--   Ignore overlaps with more than %.6f%% error.\n", 100.0 * _errorLimit);

    if (logFileFlagSet(LOG_BEST_EDGES))
      reportBestEdges(prefix, "1.filtered");

    checkForContainedDovetails();
    checkForCovGapEdges();
  }

  else {
    writeStatus("BestOverlapGraph()-- NOT filtering high error edges.\n");
  }

  //
  //  Mark reads as coverageGap if they are not fully covered by (good) overlaps.
  //

  if (covGapType != covgapNone) {
    writeStatus("BestOverlapGraph()-- Filtering reads with a gap in overlap coverage.\n");

    removeReadsWithCoverageGap(prefix,
                               covGapType,
                               covGapOlap);           //  Remove crappy reads.
    findContains();                                   //  Recompute contained reads; remove those contained in covGap reads.
    findEdges(true);                                  //  Recompute best edges.

    if (logFileFlagSet(LOG_BEST_OVERLAPS))
      outputOverlaps(prefix, "2.covGap", false);

    writeStatus("BestOverlapGraph()--   %u reads removed.\n", numCoverageGap());

    if (logFileFlagSet(LOG_BEST_EDGES))
      reportBestEdges(prefix, "2.covGap");

    checkForContainedDovetails();
    checkForCovGapEdges();
  }

  else {
    writeStatus("BestOverlapGraph()-- NOT filtering reads with a gap in overlap coverage.\n");
  }

  //
  //  Report initial statistics.
  //

  reportEdgeStatistics(report, "INITIAL");

  //
  //  Mark reads as lopsided if the length of the best edge out is very
  //  different than the length of the best edge that should be back to us.
  //  E.g., if readA has best edge to readB (of length lenAB), but readB has
  //  best edge to readC (of length lenBC), and lenAB is much shorter than
  //  lenBC, then something is wrong with readA.
  //

  if (lopsidedDiff > 0) {
    writeStatus("BestOverlapGraph()-- Filtering reads with lopsided best edges (more than %.2f%% different).\n", lopsidedDiff);

    removeLopsidedEdges(prefix, "lopsided", lopsidedDiff);   //  Remove reads that look weird (not that we discriminate against weirdness!)
    //findContains();                                        //  DO NOT recompute contained reads.
    findEdges(false);                                        //  Recompute best edges that have no existing edge.

    if (logFileFlagSet(LOG_BEST_OVERLAPS))
      outputOverlaps(prefix, "3.lopsided", false);

    writeStatus("BestOverlapGraph()--   %u reads have lopsided edges.\n", numLopsided());

    if (logFileFlagSet(LOG_BEST_EDGES))
      reportBestEdges(prefix, "3.lopsided");

    checkForContainedDovetails();
    checkForCovGapEdges();
  }

  else {
    writeStatus("BestOverlapGraph()-- NOT filtering reads with lopsided best edges.\n");
  }

  //
  //  Now that we know the max difference allowed in overlaps, mark reads as
  //  spurs if they're spurs.  Then don't find best edges to them.
  //
  //  Spanned spurs searches the graph for dead ends, then backs up to a
  //  previous branch.  It (supposedly) does this such that the spur path is
  //  preserved unless a branch is found.
  //
  //  Do NOT call findEdges() after this.  It will RESET all the work done by
  //  removeSpannedSpurs().
  //

  if (filterSpur) {
    writeStatus("BestOverlapGraph()-- Filtering spur reads.\n");

    removeSpannedSpurs(prefix, spurDepth);

    //  Verify, manually, that findEdges(false) does not change any of
    //  the spur removal results.
    //reportBestEdges(prefix, "9.beforeFindEdges");
    //findEdges(false);
    //reportBestEdges(prefix, "9.afterFindEdges");

    if (logFileFlagSet(LOG_BEST_EDGES))
      reportBestEdges(prefix, "4.spurs");

    checkForContainedDovetails();
    checkForCovGapEdges();
  }

  else {
    writeStatus("BestOverlapGraph()-- NOT filtering spur reads.\n");
  }

  //
  //  We tried a second pass of lopsided end removal here, but didn't like
  //  it.  It significantly affected HiFi humans; chm13 20kb library
  //  assembled worse than the 10kb library.  Almost all assemblies are down
  //  to 15 Mbp N50s.  BAC resolution is worse.
  //

  //
  //  All done!  Do some final checks and cleanup, dump various logs and reports.
  //

  if (logFileFlagSet(LOG_BEST_OVERLAPS))
    outputOverlaps(prefix, "4.spur-removal", false);

  reportBestEdges(prefix, "best");

  reportEdgeStatistics(report, "FINAL");

  checkForContainedDovetails();
  checkForCovGapEdges();

  fprintf(report, "\n");
  fprintf(report, "EDGE FILTERING\n");
  fprintf(report, "-------- ------------------------------------------\n");
  fprintf(report, "%8u reads are ignored\n",  numIgnored());
  fprintf(report, "%8u reads have a gap in overlap coverage\n", numCoverageGap());
  fprintf(report, "%8u reads have lopsided best edges\n", numLopsided());

  //  Done with scoring data.

  delete [] _best5score;    _best5score = NULL;
  delete [] _best3score;    _best3score = NULL;

  merylutil::closeFile(report);

  setLogFile(prefix, NULL);
}
