/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2006) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

#ifndef FORCE_DEBUG
     #define NDEBUG
#endif

#include <set>

#include "Alignment.h"
#include "Basevector.h"
#include "CoreTools.h"
#include "Equiv.h"
#include "Feudal.h"
#include "ReadPairing.h"
#include "math/Functions.h"
#include "math/HoInterval.h"
#include "paths/AlignHyperKmerPath.h"
#include "paths/AlignSeqsToHyper.h"
#include "paths/AlignPairsToHyper.h"
#include "paths/EvalUtils.h"
#include "paths/GlobalCleanAnnex.h"
#include "paths/Hospital.h"
#include "paths/HyperKmerPath.h"
#include "paths/KmerBaseBroker.h"
#include "paths/PullApartHyperKmerPath.h"
#include "paths/SeqOnHyper.h"
#include "paths/SimpleLoop.h"


/* Function: CleanupHyperkmerPath
   Performs basic tidying of the HyperKmerPath, in place.  The passes are run
   in this fixed order: RemoveUnneededVertices, ReduceLoops,
   CompressEdgeObjects, RemoveDeadEdgeObjects, RemoveEdgelessVertices.
*/
void CleanupHyperkmerPath(HyperKmerPath& h) {
  // Vertex and loop passes.
  h.RemoveUnneededVertices();
  h.ReduceLoops();

  // Edge-object passes, followed by removal of vertices left without edges.
  h.CompressEdgeObjects();
  h.RemoveDeadEdgeObjects();
  h.RemoveEdgelessVertices();
}


/* Function: FindInsertSizes
   Returns the list of insert sizes (the sep field of each pairing), in
   increasing order, lumping sizes together (under the largest) if they are
   within 1% of each other.  Lumping is done walking from the largest size
   downwards: a size is kept only if it is below 99% of the last size kept.

   Returns an empty list if pairs is empty.
*/
vec<int> FindInsertSizes(const vec<read_pairing>& pairs) {
  // Collect the distinct separations, sorted.
  set<int> all_insert_sizes;
  for (vec<read_pairing>::const_iterator p = pairs.begin(); p != pairs.end(); ++p )
    all_insert_sizes.insert(p->sep);

  vec<int> insert_sizes;

  // No pairs means no insert sizes; dereferencing rbegin() of an empty set
  // would be undefined behavior.
  if ( all_insert_sizes.empty() )
    return insert_sizes;

  // Walk from largest to smallest, keeping a size only if it is more than 1%
  // below the most recently kept one.
  set<int>::reverse_iterator s = all_insert_sizes.rbegin();
  insert_sizes.push_back(*s);
  for(++s; s != all_insert_sizes.rend(); ++s)
    if( *s < .99 * insert_sizes.back() )
      insert_sizes.push_back(*s);

  // Sizes were collected in decreasing order; return them increasing.
  insert_sizes.ReverseMe();

  return insert_sizes;
}

/* Function: TrackHkpChanges
   Aligns the current HyperKmerPath to the reference and prints the result
*/
void TrackHkpChanges( const String& message, const HyperKmerPath& h, 
		      const KmerBaseBroker* kbb, const String& data_dir,
		      const String& tmp_dir) {
  String bar = "##############################################################"
    "######################\n";
  cout << "\n" << bar << "\n" << message << "\n";
  AlignAndPrintHyperKmerPath( cout, h, kbb, data_dir + "/genome", tmp_dir );
  cout << "\n" << bar << "\n";
}


/* Function: IsomorphicComponentCheck
   Runs FindIsomorphicComponents on a copy of h that has had
   RemoveDuplicateEdges and CanonicalizeEdges applied.  If that call reports
   true, prints a failure report to cout and writes a DOT summary of h to
   outDir/hyper.isomorphic.<n>.dot, where <n> counts the failures reported so
   far by this function.  Otherwise does nothing.  h itself is never modified.
*/
void IsomorphicComponentCheck(String outDir, const HyperKmerPath& h) {
  // Number of failures reported so far; numbers the dot files.
  static int counter = 0;

  HyperKmerPath canonical = h;
  canonical.RemoveDuplicateEdges();
  canonical.CanonicalizeEdges();

  equiv_rel componentRelation;
  vec<int> isomorphicComponentReps;
  if ( !canonical.FindIsomorphicComponents(componentRelation, isomorphicComponentReps) )
    return;

  cout << "Isomorphic Component Check Failed:" << endl;
  cout << "Graph has " << h.ConnectedComponents() << " components and "
       << h.N() << " vertices" << endl;
  cout << "Graph has " << isomorphicComponentReps.isize()
       << " isomorphic components" << endl;

  ++counter;
  cout << "See isomorphic dot file " << counter << endl;

  // Write component to dot file for reference
  Ofstream( dot, outDir + "/hyper.isomorphic." + ToString(counter) + ".dot" );
  h.PrintSummaryDOT0w(dot, True, False, False, &isomorphicComponentReps);
}


// Remove short hanging ends.  Look for
//
//                 x
//                 |
//                 e
//                 |
//        u --c--> v --d--> w
//
// where x is a source or sink, e is short (and can go either way), whereas
// c and d are long.
//
// For every vertex x of h that is a source with exactly one outgoing edge, or
// a sink with exactly one incoming edge, the single edge e at x is deleted
// when all of these hold:
//   - h.EdgeLength(e) <= 5000;
//   - the vertex v at the other end of e has exactly 2 incoming and 1 outgoing
//     edges (x a source), or exactly 1 incoming and 2 outgoing edges (x a sink);
//   - 20 * h.EdgeLength(e) does not exceed the largest KmerCount found among
//     the "competitor" edges gathered below.
//
// NOTE(review): c and d are identified below but never read afterwards, so
// the "c and d are long" condition from the picture is not tested directly.

void RemoveHangingEnds( HyperKmerPath& h ) {

  for ( int x = 0; x < h.N( ); x++ ) {
    
    // Check that basic assumptions are satisfied, including length(e) <= 5kb.
    
    // v = vertex at the other end of e; c, d = the through edges at v.
    int v, c, d, e;
    if ( h.Source(x) && h.From(x).size( ) == 1 ) {
      v = h.From(x)[0];
      e = h.EdgeObjectIndexByIndexFrom( x, 0 );
    } else if ( h.Sink(x) && h.To(x).size( ) == 1 ) {
      v = h.To(x)[0];
      e = h.EdgeObjectIndexByIndexTo( x, 0 );
    } else 
      continue;

    if ( h.EdgeLength(e) > 5000 )
      continue;

    if ( h.Source(x) ) {
      // e enters v, so v must have e plus one other incoming edge (c) and a
      // single outgoing edge (d).
      if ( !( h.From(v).size( ) == 1 && h.To(v).size( ) == 2 ) )
	continue;
      d = h.EdgeObjectIndexByIndexFrom( v, 0 );
      c = h.EdgeObjectIndexByIndexTo( v, 0 );
      if ( c == e ) 
	c = h.EdgeObjectIndexByIndexTo( v, 1 );
    } else {
      // e leaves v, so v must have e plus one other outgoing edge (d) and a
      // single incoming edge (c).
      if ( !( h.From(v).size( ) == 2 && h.To(v).size( ) == 1 ) ) 
	continue;
      c = h.EdgeObjectIndexByIndexTo( v, 0 );
      d = h.EdgeObjectIndexByIndexFrom( v, 0 );
      if ( d == e ) 
	d = h.EdgeObjectIndexByIndexFrom( v, 1 );
    }

    // We require that there is an edge "competing with e", that is at least
    // 20 times longer.
    
    // The vectors are static, so they are reused across iterations and calls.
    static vec<vrtx_t> v_only(1), to_v, from_v;
    v_only[0] = v;
    int max_competitor = 0;
    if ( h.Source(x) ) {
      // Competitors: every edge entering a vertex returned by GetPredecessors
      // for v.  Note that the loop-local 'e' below shadows the edge index e.
      h.digraph::GetPredecessors( v_only, to_v );
      for ( int j = 0; j < to_v.isize( ); j++ ) {
	int z = to_v[j];
	for ( int i = 0; i < h.To(z).isize( ); i++ ) {
	  const KmerPath& e = h.EdgeObjectByIndexTo( z, i );
	  max_competitor = Max( max_competitor, e.KmerCount( ) );
	}
      }
    } else {
      // Competitors: every edge leaving a vertex returned by GetSuccessors
      // for v.  Again the loop-local 'e' shadows the edge index e.
      h.digraph::GetSuccessors( v_only, from_v );
      for ( int j = 0; j < from_v.isize( ); j++ ) {
	int z = from_v[j];
	for ( int i = 0; i < h.From(z).isize( ); i++ ) {
	  const KmerPath& e = h.EdgeObjectByIndexFrom( z, i );
	  max_competitor = Max( max_competitor, e.KmerCount( ) );
	}
      }
    }

    // Here e is again the edge index declared at the top of the loop body.
    if ( 20 * h.EdgeLength(e) > max_competitor )
      continue;
    
    // Edit the graph.
    
    // x has exactly one edge on the relevant side, so index 0 is e.
    if ( h.Source(x) ) 
      h.DeleteEdgeFrom( x, 0 );
    else 
      h.DeleteEdgeTo( x, 0 );

  }
}


void ReachForward( int e, const vec<int>& all_edges, 
		   const vec< VertexPair >& pair_places,
		   const vec< vec<int> >& pair_places_index, 
		   const vec<CompressedSeqOnHyper>& csaligns, 
		   const vec< vec<int> >& csaligns_index,
		   const vec< vec<int> >& csaligns_indexp, 
		   const vec<int>& pairs_index, const Bool require_unique,
		   const vec<Bool>& hits_unique, const vec<read_location>& readlocs,
		   const vec<int>& readlocs_index, Bool verbose, vec<int>& reaches_to,
		   const HyperBasevector& hb ) {

  reaches_to.clear( );
  for ( int u = 0; u < pair_places_index[e].isize( ); u++ ) {
    int pp = pair_places_index[e][u];
    int i1 = pair_places[pp].first, i2 = pair_places[pp].second;
    if ( require_unique && !hits_unique[i1] )
      continue;
    if ( require_unique && !hits_unique[i2] )
      continue;
	  
    //           const SeqOnHyper &ap1 = saligns[i1], &ap2 = saligns[i2];
    //           int N1 = ap1.N( ), N2 = ap2.N( );
    //           if ( N1 != 1 || ap1.Id2(0) != e ) continue;
    //           int id1 = ap1.Id1( ), id2 = ap2.Id1( );
    //           if ( !csaligns_index[id1].solo( ) ) continue;
    //           if ( !csaligns_indexp[ pairs_index[id1] ].solo( ) ) continue;
    //           if ( N2 == 1 && ap1.Id2(0) == ap2.Id2(0) ) continue;

    const CompressedSeqOnHyper &cap1 = csaligns[i1], &cap2 = csaligns[i2];

    if( ! cap1.SingleEdge() || cap1.Part0().Id2() != e )
      continue;
    int id1 = cap1.Id1( ), id2 = cap2.Id1( );
    if ( !csaligns_index[id1].solo( ) )
      continue;
    if ( !csaligns_indexp[ pairs_index[id1] ].solo( ) )
      continue;
    if ( cap2.SingleEdge() && cap1.Part0().Id2() == cap2.Part0().Id2() ) 
      continue;

    // Only now do we decompress.
    SeqOnHyper ap1, ap2;
    cap1.DecompressInto(ap1, hb);
    cap2.DecompressInto(ap2, hb);
    int N1 = ap1.N( ), N2 = ap2.N( );
    
    Bool off = False;
    for ( int m = 0; m < N2; m++ )
      if ( !BinMember( all_edges, ap2.Id2(m) ) ) off = True;

    if (off) continue;

    if (verbose) {
      cout << "\n";
      if (require_unique)
	cout << "unique-";
      cout << "linking from " << BaseAlpha( ap1.Id2(0) ) << "."
	   << ap1.pos2(0) << "-" << ap1.Pos2(0) << " to ";
      for ( int m = 0; m < N2; m++ ) {
	if ( m > 0 ) 
	  cout << ",";
	cout << BaseAlpha( ap2.Id2(m) );
      }
      cout << "." << ap2.pos2(N2-1) << "-" << ap2.Pos2(N2-1) << "\n";
      if ( readlocs.nonempty( ) ) {
	cout << readlocs[ readlocs_index[id1] ];
	cout << readlocs[ readlocs_index[id2] ];
      }
    }

    for ( int m = 0; m < N2; m++ )
      reaches_to.push_back( ap2.Id2(m) );

  }

  UniqueSort(reaches_to);
}

void ReadsHittingUniqueKmers( const HyperKmerPath& h, const HyperBasevector& hb,
			      const vec<tagged_rpint>& uniqdb20,
			      const vec<CompressedSeqOnHyper>& csaligns,
			      vec<Bool>& hits_unique ) {

  // Mark (putatively) unique kmers on edges.
  
  int nedges = h.EdgeObjectCount( );
  vec<int> L(nedges); 
  for ( int i = 0; i < nedges; i++ )
    L[i] = h.EdgeLength(i);

  vec< vec<ho_interval> > huniq(nedges);
  for ( int i = 0; i < nedges; i++ ) {
    static vec<ho_interval> huniq0;
    huniq0.clear( );
    const KmerPath& e = h.EdgeObject(i);
    int pos = 0;
    for ( int j = 0; j < e.NSegments( ); j++ ) {
      static vec<longlong> places;
      const KmerPathInterval& J = e.Segment(j);
      Contains( uniqdb20, J, places );
      for ( int u = 0; u < places.isize( ); u++ ) {
	const tagged_rpint& t = uniqdb20[ places[u] ];
	int start = Max( 0, int( t.Start( ) - J.Start( ) ) );
	int stop = Min( J.Length( ), int( t.Stop( ) - J.Start( ) ) );
	huniq0.push_back( ho_interval( pos+start, pos+stop ) );
      }
      pos += J.Length( );
    }
    ExtractGivenCoverage( L[i], 1, huniq0, huniq[i] );
  }

  // Determine which read alignments hit unique kmers.
  
  hits_unique.resize_and_set( csaligns.size( ), False );
  for ( int i = 0; i < csaligns.isize( ); i++ ) {
    static SeqOnHyper ap;
    csaligns[i].DecompressInto(ap, hb);
    for ( int j = 0; j < ap.N( ); j++ ) {
      int id2 = ap.Id2(j);
      ho_interval H( ap.pos2(j), ap.Pos2(j) - h.K( ) + 1 );
      if ( Overlap( H, huniq[id2] ) > 0 )
	hits_unique[i] = True;
    }
  }
}

/* Function: FindLinksBetweenComponents
   Find links between components from read pair information
   Does NOT do anything to the HyperKmerPath, just displays a list of links

   A pair contributes a link when each of its two reads has exactly one
   alignment in csaligns, both alignments are flagged by
   ReadsHittingUniqueKmers, and the edges holding the first part of each
   alignment end in vertices of different components.  Each distinct link is
   printed once to cout, preceded by the number of pairs that produced it, in
   the form "[count] <c1>fw|rc <--> <c2>fw|rc", where c1 and c2 are positions
   in the list of component representatives.  The elapsed time is printed last.
*/
void FindLinksBetweenComponents(const HyperKmerPath& h, const HyperBasevector& hb,
				const vec<tagged_rpint>& uniqdb20,
				const vec<CompressedSeqOnHyper>& csaligns,
				const vec<read_pairing>& pairs,
				const int nreads) {

  double clock = WallClockTime( );
  cout << "\nLinks between components:\n";

  // Component structure of the graph: equivalence relation on vertices and
  // one representative per component.
  equiv_rel e;
  h.ComponentRelation(e);
  vec<int> reps;
  e.OrbitRepsAlt(reps);

  // For each read id, the indices of its alignments in csaligns.
  vec< vec<int> > saligns_index(nreads);
  for ( int i = 0; i < csaligns.isize( ); i++ )
    saligns_index[ csaligns[i].Id1( ) ].push_back(i);

  vec<vrtx_t> to_right_vertex;
  h.ToRight(to_right_vertex);
  vec<Bool> hits_unique;
  ReadsHittingUniqueKmers( h, hb, uniqdb20, csaligns, hits_unique );

  vec<String> links;
  for ( int i = 0; i < pairs.isize( ); i++ ) {
    int id1 = pairs[i].id1, id2 = pairs[i].id2;

    // Both reads must be placed exactly once, and on unique sequence.
    if ( !saligns_index[id1].solo( ) )
      continue;
    if ( !saligns_index[id2].solo( ) )
      continue;
    int i1 = saligns_index[id1][0];
    int i2 = saligns_index[id2][0];
    if ( !hits_unique[i1] || !hits_unique[i2] )
      continue;

    // Map each alignment to the component of the right vertex of its first edge.
    const CompressedSeqOnHyper &cs1 = csaligns[i1], &cs2 = csaligns[i2];
    int j1 = cs1.Part0( ).Id2( ), j2 = cs2.Part0( ).Id2( );
    int v1 = to_right_vertex[j1], v2 = to_right_vertex[j2];
    int c1 = BinPosition( reps, e.ClassId(v1) );
    int c2 = BinPosition( reps, e.ClassId(v2) );
    if ( c1 == c2 )
      continue;

    ostrstream out;
    out << c1 << ( cs1.Rc1( ) ? "rc" : "fw" ) << " <--> "
	<< c2 << ( cs2.Rc1( ) ? "rc" : "fw" ) << ends;
    links.push_back( out.str( ) );
    // str( ) freezes the stream's buffer, which would then not be released
    // when 'out' is destroyed.  The text has been copied into links, so
    // unfreeze to let the destructor free it.
    out.freeze(false);
  }

  Sort(links);

  // Print each distinct link once, with its multiplicity.
  for ( int i = 0; i < links.isize( ); i++ ) {
    int j;
    for ( j = i + 1; j < links.isize( ); j++ )
      if ( links[j] != links[i] )
	break;

    cout << "[" << j-i << "] " << links[i] << "\n";
    i = j - 1;
  }

  cout << "\n" << TimeSince(clock) << " used finding links" << endl;
}

// Pull apart along distinct paths (as defined by pairs) that run from source
// edges to sink edges.
//
// Two passes are made.  Each pass:
//   1. rebuilds the pair alignments of the reads to h (AlignPairsToHyper);
//   2. for every component with more than one source edge, uses ReachForward
//      repeatedly to grow, from each source edge, the set of edges reachable
//      through read pairs;
//   3. if the resulting sets account for the component well enough (see the
//      5% tests below), replaces the component by one subcomponent per source
//      edge, otherwise keeps the component as is;
//   4. rebuilds h from the chosen edge sets, removes small components, cleans
//      up, and reverses h, so that the second pass works in the opposite
//      direction and leaves h in its original orientation.
//
// h, csaligns, csaligns_index and pair_places are modified.
//
// NOTE(review): the parameters uniqdb and track_changes are not referenced in
// the body.
void PullApartSourceToSink ( HyperKmerPath& h, const KmerBaseBroker* kbb,
			     const vecbasevector& reads, const String& sub_dir,
			     vec<CompressedSeqOnHyper>& csaligns,
			     vec<vec<int> >& csaligns_index,
			     vec< VertexPair >& pair_places,
			     const vec<read_pairing>& pairs,
			     const vec<int>& pairs_index,
			     const Bool SHOW_PAIR_ALIGNS,
			     Bool verbose, const vec<tagged_rpint>& uniqdb,
			     const vec<tagged_rpint>& uniqdb20,
			     const vec<tagged_rpint>& unipathsxdb,
			     const vec<int>& predicted_copyno,
			     const int MIN_COMPONENT,
			     const vec<read_location>& readlocs,
			     const vec<int>& readlocs_index,
			     const Bool USE_TRUTH,
			     const String& data_dir, const String& tmp_dir,
			     const Bool track_changes
) {
  // We go through the HyperKmerPath twice, once forward, and once backwards.

  int nreads = reads.size( );
  for ( int opass = 1; opass <= 2; opass++ ) {
    cout << Date( ) << ": Pull Apart Source To Sink Loop #" << opass << endl;

    // Rebuild alignments.

    double clock = WallClockTime( );
    HyperBasevector hb2( h, *kbb );
    AlignPairsToHyper( h, hb2, reads, pairs, sub_dir, csaligns, csaligns_index,
		       pair_places, SHOW_PAIR_ALIGNS );
    cout << TimeSince(clock) << " used rebuilding alignments" << endl;

    // For each edge, find the predicted copy number n of the unipath meeting
    // it that has lowest predicted copy number.  It follows that if the
    // unipaths are correct, and the predicted copy numbers are correct, than
    // the edge cannot perfectly match the genome more than n times.  Note
    // that the edge might have kmers that were created after the unipaths 
    // were created, and these won't appear in the unipaths.
    vec<int> edge_copyno;
    MaxEdgeCopyNumber( h, unipathsxdb, predicted_copyno, edge_copyno );

    // Set up pair index.
    // csaligns_indexp[p] lists the entries of pair_places belonging to pair p,
    // where p is looked up from the read id of the placement's first read.
    vec<vec<int> > csaligns_indexp( pairs.isize() );
    for ( int i = 0; i < pair_places.isize( ); i++ ) {
      const CompressedSeqOnHyper& cap1 = csaligns[ pair_places[i].first ];
      csaligns_indexp[ pairs_index[ cap1.Id1( ) ] ].push_back(i);
    }

    // Compute weighted number of reads landing on each edge.
    // Each alignment contributes 1 / (number of alignments of its read) to
    // every edge it touches.
    
    vec<double> readhits( h.EdgeObjectCount( ), 0 );
    for ( int i = 0; i < csaligns.isize( ); i++ ) {
      static SeqOnHyper s;
      csaligns[i].DecompressInto(s, hb2);
      for ( int j = 0; j < s.N( ); j++ ) {
	readhits[ s.Id2(j) ] += 1.0 / double( csaligns_index[ s.Id1( ) ].size( ) );
      }
    }

    // Determine which read alignments hit unique kmers.
    
    vec<Bool> hits_unique;
    ReadsHittingUniqueKmers( h, hb2, uniqdb20, csaligns, hits_unique );

    // Try again to pull apart overcollapsed components.  For this, we look
    // for components having more than one source edge.  We use read pairs to
    // reach forward from those edges s, requiring that the first read of the
    // pair land fully on s, and have no other placement, and that the read
    // pair have only one placement on the HyperKmerPath.

    if (verbose && USE_TRUTH)
      TrackHkpChanges("At final pull-apart.  Before:", h, kbb, data_dir, tmp_dir);
    else if (verbose)
      h.PrintSummaryPlus(cout);

    // pair_places_index[edge] lists the pair placements in which either read
    // touches that edge.
    vec< vec<int> > pair_places_index( h.EdgeObjectCount( ) );
    for ( int i = 0; i < pair_places.isize( ); i++ ) {
      static SeqOnHyper ap1, ap2;
      csaligns[ pair_places[i].first ].DecompressInto(ap1, hb2);
      csaligns[ pair_places[i].second ].DecompressInto(ap2, hb2);
      for ( int j = 0; j < ap1.N( ); j++ )
	pair_places_index[ ap1.Id2(j) ].push_back(i);
      for ( int j = 0; j < ap2.N( ); j++ )
	pair_places_index[ ap2.Id2(j) ].push_back(i);
    }

    // Component structure of h.  Note that several inner scopes below declare
    // an 'int e' (an edge index) that shadows this equiv_rel.
    equiv_rel e;
    h.ComponentRelation(e);
    vec<int> reps;
    e.OrbitRepsAlt(reps);
    vec<vrtx_t> to_right_vertex, to_left_vertex;
    h.ToRight(to_right_vertex), h.ToLeft(to_left_vertex);

    // Coverage model: read_rate is the number of reads per kmer of edge
    // sequence, and an edge of length L is taken to offer
    // L + mean_kmers_in_read - 1 read start positions.
    // NOTE(review): if nreads is 0, or the total edge length is 0, the
    // divisions below divide by zero in floating point.
    longlong readbases = 0;
    for ( int i = 0; i < nreads; i++ )
      readbases += reads[i].size( );
    double mean_read = double(readbases) / double(nreads);
    int mean_kmers_in_read = int(floor(mean_read)) - h.K( ) + 1;
    vec<int> edgelengths;
    for ( int i = 0; i < h.EdgeObjectCount( ); i++ )
      edgelengths.push_back( h.EdgeObject(i).KmerCount( ) );
    double read_rate = double(nreads) / double( BigSum(edgelengths) );

    // C collects the edge sets from which the new graph is built: either all
    // the edges of an untouched component, or one set per source edge of a
    // component that is split.
    vec< vec<int> > C;
    for ( int i = 0; i < reps.isize( ); i++ ) {
      static vec<vrtx_t> o;
      static vec<int> sources;
      static vec<int> all_edges;
      e.Orbit( reps[i], o );
      sources.clear( ), all_edges.clear( );

      // Source edges: edges leaving a source vertex of this component.
      for ( int j = 0; j < o.isize( ); j++ ) {
	if ( h.Source( o[j] ) ) {
	  for ( int u = 0; u < h.From( o[j] ).isize( ); u++ ) {
	    sources.push_back( h.EdgeObjectIndexByIndexFrom( o[j], u ) );
	  }
	}
      }

      // All edges of this component.
      for ( int j = 0; j < o.isize( ); j++ ) {
	int v = o[j];
	for ( int k = 0; k < h.From(v).isize( ); k++ ) {
	  int e = h.EdgeObjectIndexByIndexFrom( v, k );
	  all_edges.push_back(e);
	}
      }

      // Sorted because ReachForward tests membership with BinMember.
      Sort(all_edges);

      // Nothing to pull apart unless there are at least two source edges.
      if ( sources.size( ) <= 1 ) {
	C.push_back(all_edges);
	continue;
      }

      // reach[j] is the set of edges reached from sources[j].  A source edge
      // whose estimated coverage exceeds max_cov does not seed its own set.
      static vec< vec<int> > reach;
      reach.clear( );
      reach.resize( sources.size( ) );
      double max_cov = 1.5;
      for ( int j = 0; j < sources.isize( ); j++ ) {
	int s = sources[j];
	int nstarts = edgelengths[s] + mean_kmers_in_read - 1;
	double cov = ( readhits[s]/double(nstarts) ) / read_rate;
	if ( cov > max_cov )
	  continue;
	reach[j].push_back(s);
      }

      // There are two passes.  On the first pass, we only use reads that
      // hit unique sequence.  On the second pass, we refuse to adjoin
      // edges that were obtained on the first pass.
      
      vec<int> pass1_hits;
      for ( int pass = 1; pass <= 2; pass++ ) {
	if ( pass == 2 ) {
	  for ( int j = 0; j < sources.isize( ); j++ )
	    pass1_hits.append( reach[j] );

	  UniqueSort(pass1_hits);
	}

	for ( int j = 0; j < sources.isize( ); j++ ) {
	  // Keep extending reach[j] until a round adds no new edge.
	  while(1) {
	    int old_reaches = reach[j].size( );
	    for ( int u = 0; u < old_reaches; u++ ) {
	      int e = reach[j][u];
	      int nstarts = edgelengths[e] + mean_kmers_in_read - 1;
	      double cov = ( readhits[e]/double(nstarts) ) / read_rate;
	      // On pass 2, do not reach forward from high-coverage edges.
	      if ( pass == 2 && cov > max_cov )
		continue;
	      if (verbose) {
		cout << "\nreaching forward from edge " << BaseAlpha(e) 
		     << ", cov = " << cov << "\n";
	      }

	      static vec<int> new_reach;
	      ReachForward( e, all_edges, pair_places, 
			    pair_places_index, csaligns, csaligns_index, 
			    csaligns_indexp, pairs_index, pass == 1,
			    hits_unique, readlocs, readlocs_index, 
			    verbose, new_reach, hb2 );

	      if ( pass == 1 ) 
		reach[j].append(new_reach);
	      else {
		for ( int r = 0; r < new_reach.isize(); r++ ) {
		  if ( !BinMember( pass1_hits, new_reach[r] ) ) {
		    reach[j].push_back( new_reach[r] );
		  }
		}
	      }
	    }

	    UniqueSort( reach[j] );
	    if ( reach[j].isize( ) == old_reaches ) 
	      break;
	  }
	}
      }
      if (verbose) {
	cout << "\nPropose splitting component " << i << ":\n";
	for ( int j = 0; j < reach.isize( ); j++ ) {
	  cout << "[" << j+1 << "]:";
	  for ( int u = 0; u < reach[j].isize( ); u++ ) {
	    int e = reach[j][u];
	    cout << " " << BaseAlpha(e) << "[" << h.EdgeLength(e) << "]";
	  }
	  cout << "\n";
	}
      }    

      // Classify the edges of the component:
      //   used_edges     - in at least one reach set;
      //   multiple_edges - in more than one reach set (with their counts);
      //   missing_edges  - in no reach set.
      vec<int> used_edges, multiple_edges, multiple_edges_count, missing_edges;
      for ( int j = 0; j < reach.isize( ); j++ )
	used_edges.append( reach[j] );
      Sort(used_edges);
      for ( int j = 0; j < used_edges.isize( ); j++ ) {
	int k;
	for ( k = j + 1; k < used_edges.isize( ); k++ )
	  if ( used_edges[k] != used_edges[j] )
	    break;
	if ( k - j > 1 ) {
	  multiple_edges.push_back( used_edges[j] );
	  multiple_edges_count.push_back( k - j );
	}
	j = k - 1;
      }

      UniqueSort(used_edges);

      for ( int j = 0; j < o.isize( ); j++ ) {
	int v = o[j];
	for ( int u = 0; u < h.From(v).isize( ); u++ ) {
	  int e = h.EdgeObjectIndexByIndexFrom( v, u );
	  if ( !BinMember( used_edges, e ) )
	    missing_edges.push_back(e);
	}
      }

      Sort(missing_edges);

      // total_found is computed before any missing edge is added to the
      // reach sets.
      int total_found = 0;
      for ( int j = 0; j < reach.isize( ); j++ ) {
	for ( int u = 0; u < reach[j].isize( ); u++ )
	  total_found += h.EdgeLength( reach[j][u] );
      }

      // Missing edges with copy number > 1 are given to every subcomponent
      // and no longer count as missing.
      vec<Bool> removem( missing_edges.size( ), False );
      for ( int u = 0; u < missing_edges.isize( ); u++ ) {
	int e = missing_edges[u];
	if ( edge_copyno[e] <= 1 )
	  continue;
	removem[u] = True;
	for ( int j = 0; j < reach.isize( ); j++ )
	  reach[j].push_back(e);
      }

      for ( int j = 0; j < reach.isize( ); j++ )
	Sort( reach[j] );
      EraseIf( missing_edges, removem );
      int total_missing = 0;
      for ( int j = 0; j < missing_edges.isize( ); j++ ) {
	int e = missing_edges[j];
	total_missing += h.EdgeLength(e);
      }

      // If the remaining missing edges amount to at most 5% of what was
      // found, add them to every subcomponent too; otherwise give up on
      // splitting this component.
      // NOTE(review): total_found can be 0 (e.g. if every reach set is empty),
      // in which case this is a floating-point division by zero.
      if ( double(total_missing)/double(total_found) <= 0.05 ) {
	for ( int u = 0; u < missing_edges.isize( ); u++ ) {
	  int e = missing_edges[u];
	  for ( int j = 0; j < reach.isize( ); j++ )
	    reach[j].push_back(e);
	}
	for ( int j = 0; j < reach.isize( ); j++ )
	  Sort( reach[j] );
      } else {
	if (verbose) {
	  cout << "There are edges not placed in any subcomponent:\n";
	  for ( int j = 0; j < missing_edges.isize( ); j++ ) {
	    int e = missing_edges[j];
	    cout << BaseAlpha(e) << "[" << h.EdgeLength(e) << "]\n";
	  }
	}
	C.push_back(all_edges);
	continue;
      }

      // An edge placed in more subcomponents than its copy number allows is a
      // "bad multiple".  Give up on splitting if these exceed 5% of what was
      // found.
      int total_bad_multiples = 0;
      for ( int j = 0; j < multiple_edges.isize( ); j++ ) {
	int e = multiple_edges[j];
	if ( multiple_edges_count[j] <= edge_copyno[e] ) 
	  continue;
	total_bad_multiples += h.EdgeLength(e);
      }

      if ( double(total_bad_multiples)/double(total_found) > 0.05 ) {
	for ( int j = 0; j < multiple_edges.isize( ); j++ ) {
	  int e = multiple_edges[j];
	  if ( multiple_edges_count[j] <= edge_copyno[e] )
	    continue;
	  if (verbose) {
	    cout << "Edge " << BaseAlpha(e) << "[" << edgelengths[e]
		 << "] placed " << multiple_edges_count[j] << " times, "
		 << "but copyno = " << edge_copyno[e] << "\n";
	  }
	}
	C.push_back(all_edges);
	continue;
      }

      // Connectivity check of each proposed subcomponent: two edges are
      // joined when the right vertex of one is the left vertex of the other.
      // This only produces a report when verbose; a disconnected subcomponent
      // does not stop the split.
      for ( int j = 0; j < reach.isize( ); j++ ) {
	equiv_rel eq( reach[j].size( ) );
	for ( int u1 = 0; u1 < reach[j].isize( ); u1++ ) {
	  int e1 = reach[j][u1];
	  for ( int u2 = 0; u2 < reach[j].isize( ); u2++ ) {
	    int e2 = reach[j][u2];
	    if ( to_right_vertex[e1] == to_left_vertex[e2]
		 || to_right_vertex[e2] == to_left_vertex[e1] ) {
	      eq.Join( u1, u2 );
	    }
	  }
	}
	if ( eq.OrbitCount( ) != 1 ) {
	  if (verbose) {
	    cout << "Subcomponent " << j+1 << " is not connected:\n";
	    // These statics shadow the outer 'reps' and 'o'.
	    static vec<int> reps;
	    eq.OrbitRepsAlt(reps);
	    for ( int u = 0; u < reps.isize( ); u++ ) {
	      cout << "-";
	      static vec<int> o;
	      eq.Orbit( reps[u], o );
	      Sort(o);
	      for ( int z = 0; z < o.isize( ); z++ )
		cout << " " << BaseAlpha( reach[j][ o[z] ] );
	      cout << "\n";
	    }
	  }
	}
      }

      if (verbose) {
	cout << "To split component:\n";
	for ( int j = 0; j < reach.isize( ); j++ ) {
	  cout << "[" << j+1 << "]:";
	  for ( int u = 0; u < reach[j].isize( ); u++ ) {
	    int e = reach[j][u];
	    cout << " " << BaseAlpha(e) << "[" << h.EdgeLength(e) << "]";
	  }
	  cout << "\n";
	}
      }    

      // Accept the split: one edge set per source edge.
      C.append(reach);
    }

    // Rebuild the graph from the chosen edge sets.
    HyperKmerPath h2( h, C );
    h = h2;

    if (verbose)
      cout << "\n";

    // Remove tiny components.
    
    h.RemoveSmallComponents(MIN_COMPONENT);

    // Clean up graph.
    CleanupHyperkmerPath(h);    

    IsomorphicComponentCheck(sub_dir, h);

    // Reverse.
    // After the first pass this makes the second pass run in the opposite
    // direction; after the second pass it restores the original orientation.

    h.Reverse( );    

    cout << "\n-------------------------------------------------------------"
	 << "-------------------\n\n";
  }
}


// GlobalClean: run the global cleaning pipeline on the HyperKmerPath h, in
// place, reporting progress and timings on cout.  If h has no edges the
// function prints a message and returns without doing anything.
//
// The steps, in order:
//   1. build uniqdb20 from the entries of unipathsx of at least 20 kmers whose
//      predicted copy number is 1, and find the insert sizes of the pairs;
//   2. align the reads to h and call PullApartByInsertSize;
//   3. FindPoorlyCovered;
//   4. realign the reads (alignment_plus form), then Hospital and
//      DisambiguateSimpleLoops2;
//   5. clean up, RemoveHangingEnds, RemoveSmallComponents(MIN_COMPONENT),
//      clean up again;
//   6. PullApartSourceToSink, RemoveHangingEnds, clean up;
//   7. realign the reads and call PullApartByInsertSize a second time.
// IsomorphicComponentCheck is run between the major steps, and when
// track_changes is set TrackHkpChanges prints the alignment of h to the
// reference after most of them.
//
// A working directory sub_dir + "/gdir" is created and passed to
// TrackHkpChanges and PullApartSourceToSink.
//
// NOTE(review): MAX_SHORT_INSERT_SEP is not referenced in the body.
void GlobalClean( HyperKmerPath& h, const KmerBaseBroker* kbb,
		  const KmerBaseBroker* gkbb, const String& sub_dir, 
		  const vecbasevector& reads, const vecKmerPath& unipaths,
		  const vec<read_location_short>& ulocs,
		  const vecvec<int>& ulocs_indexr, const vec<read_pairing>& pairs,
		  const vec<int>& pairs_index, const vec<int>& partner,
		  const Bool SHOW_PAIR_ALIGNS, const int MAX_SHORT_INSERT_SEP,
		  Bool verbose, const vec<tagged_rpint>& uniqdb,
		  const vecKmerPath& unipathsx, const vec<tagged_rpint>& unipathsxdb,
		  const vec<int>& predicted_copyno, const int MIN_COMPONENT,
		  const vec<read_location>& readlocs, const vec<int>& readlocs_index,
		  const Bool USE_TRUTH,
		  const String& data_dir, const Bool disambiguate_simple_loops_verbose,
		  const Bool pull_verbose, const Bool track_changes, 
		  const Bool NEW_POORLY_COVERED ) {

  cout << Date( ) << ": Begin GlobalClean" << endl;
  
  // Bail if assembly is empty.
  if ( h.EdgeObjectCount( ) == 0 ) {
    cout << "\nThe assembly is empty.  Skipping GlobalClean.\n\n";
    return;
  }

  // Start time of the step currently being timed.
  double clock;
  
  IsomorphicComponentCheck(sub_dir, h);

  // For read and pair alignments
  vec<CompressedSeqOnHyper> csaligns;
  vec<vec<int> > csaligns_index;
  vec<IndexPair> pair_places;

  // Build initial HyperBasevector
  HyperBasevector hb( h, *kbb );
  
  // Create uniqdb20.
  // Selection is by index i over unipaths; the paths themselves are taken
  // from unipathsx, keeping those with predicted copy number 1 and at least
  // 20 kmers.
  vecKmerPath uniq20;
  vec<tagged_rpint> uniqdb20;
  for ( int i = 0; i < unipaths.size( ); i++ ) {
    if ( predicted_copyno[i] != 1 )
      continue;
    if ( unipathsx[i].KmerCount( ) < 20 )
      continue;
    uniq20.push_back_reserve( unipathsx[i] );
  }
  CreateDatabase( uniq20, uniqdb20 );

  // Find the various insert sizes. Helpful because we get the most power by
  // pulling apart with each insert size as a bound.
  vec<int> insert_sizes = FindInsertSizes(pairs);

  // Set up temporary directory.
  String tmp_dir = sub_dir + "/gdir";
  Mkdir777(tmp_dir);

  if (track_changes)
    TrackHkpChanges("At beginning of GlobalClean:", h, kbb, data_dir, tmp_dir);

  // Align the reads to the HyperKmerPath.
  clock = WallClockTime( );
  AlignSeqsToHyper(h, hb, reads, sub_dir, csaligns, SHOW_PAIR_ALIGNS );
  cout << TimeSince(clock) << " used aligning reads to hyper (1)" << endl;

  BuildAlignmentIndex(csaligns, reads.size(), csaligns_index);

  cout << Date( ) << ": Start of pull apart loop 1" << endl;
 
  // Attempt to pull apart overcollapsed vertices and edges, if links across 
  // them can partition the edges into distinct genomic clusters.

  PullApartByInsertSize( h, hb, csaligns,csaligns_index, pair_places, insert_sizes,
			 pairs, pairs_index, unipathsxdb, predicted_copyno,
			 reads, kbb, sub_dir, pull_verbose, SHOW_PAIR_ALIGNS );

  if (track_changes)
    TrackHkpChanges("After PullApartByInsertSize:", h, kbb, data_dir, tmp_dir);

  cout << Date( ) << ": Delete poorly covered edges" << endl;
  
  // Delete edges of HyperKmerPath that are not covered by bases
  // in a good pair.  
  clock = WallClockTime( );
  FindPoorlyCovered( h, hb, *kbb, csaligns, pair_places, reads, pairs, True, 
		     ( SHOW_PAIR_ALIGNS ? 2 : 1 ), NEW_POORLY_COVERED );
  cout << TimeSince(clock) << " spent in FindPoorlyCovered" << endl;

  IsomorphicComponentCheck(sub_dir, h);

  if (track_changes)
    TrackHkpChanges("After FindPoorlyCovered:", h, kbb, data_dir, tmp_dir);

  // Alignments that are only used for Hospital and DisambiguateSimpleLoops2
  vec<alignment_plus> Aligns;

  // Align reads to HyperKmerPath edges.
  // This is the alignment_plus-producing overload, which takes *gkbb and the
  // unipath locations rather than writing to csaligns.
  clock = WallClockTime( );
  AlignSeqsToHyper( h, hb, reads, *gkbb, unipaths, ulocs, ulocs_indexr, 
		    sub_dir, "brun", Aligns );
  cout << TimeSince(clock) << " used aligning reads to hyper (2)" << endl;

  cout << Date( ) << ": Repair global collapsing in Hospital" << endl;

  // Try to repair global collapsing.
  clock = WallClockTime( );
  Hospital( h, reads, Aligns, kbb, partner, verbose, uniqdb );
  cout << TimeSince(clock) << " used in hospital" << endl;
  
  // Rebuild HyperBasevector
  hb = HyperBasevector( h, *kbb );

  IsomorphicComponentCheck(sub_dir, h);
  
  if (track_changes || disambiguate_simple_loops_verbose )
    TrackHkpChanges("Before disambiguating simple loops:", h, kbb, data_dir, tmp_dir);
  
  cout << Date( ) << ": Disambiguate simple loops" << endl;

  // Try to disambiguate simple loops.
  clock = WallClockTime( );
  DisambiguateSimpleLoops2( h, hb, reads, pairs, pairs_index, 
			    Aligns, disambiguate_simple_loops_verbose );
  cout << TimeSince(clock) << " used disambiguating simple loops" << endl;

  // Don't need the Aligns anymore
  Destroy(Aligns);

  if (track_changes || disambiguate_simple_loops_verbose )
    TrackHkpChanges("After disambiguating simple loops:", h, kbb, data_dir, tmp_dir);

  cout << Date( ) << ": Clean up and remove hanging end and tiny components" << endl;
  
  // Clean up graph.
  CleanupHyperkmerPath(h);

  // Remove hanging ends and tiny components.
  RemoveHangingEnds(h);
  h.RemoveSmallComponents(MIN_COMPONENT);
  
  if (track_changes)
    TrackHkpChanges("After removing small components:", h, kbb, data_dir, tmp_dir);

  // Clean up graph.
  CleanupHyperkmerPath(h);
    
  IsomorphicComponentCheck(sub_dir, h);
  
  if (track_changes)
    TrackHkpChanges("After clean up:", h, kbb, data_dir, tmp_dir);

  // Pull apart along distinct paths (as defined by pairs) that run from source
  // edges to sink edges.
  PullApartSourceToSink(h,  kbb, reads, sub_dir, csaligns, csaligns_index,
			pair_places, pairs, pairs_index, SHOW_PAIR_ALIGNS,
			verbose, uniqdb, uniqdb20, unipathsxdb,
			predicted_copyno,  MIN_COMPONENT,
			readlocs,  readlocs_index,
			USE_TRUTH, data_dir, tmp_dir, track_changes);
  
  cout << Date( ) << ": Remove hanging ends and clean up" << endl;

  // Remove hanging ends.
  RemoveHangingEnds(h);
  
  // Clean up graph.
  CleanupHyperkmerPath(h);    

  // Rebuild hyperbasevector
  hb = HyperBasevector( h, *kbb );
  
  IsomorphicComponentCheck(sub_dir, h);
  
  // Rebuild Alignments
  clock = WallClockTime( );
  AlignSeqsToHyper(h, hb, reads, sub_dir, csaligns, SHOW_PAIR_ALIGNS );
  cout << TimeSince(clock) << " used rebuilding read alignments" << endl;

  BuildAlignmentIndex(csaligns, reads.size(), csaligns_index);

  cout << Date( ) << ": Start of pull apart loop 2" << endl;

  // Second attempt to pull apart overcollapsed vertices, in case
  // the cleaning revealed anything we missed before.
  // If it has no effect, we don't rebuild, so there's little cost to trying.
  PullApartByInsertSize( h, hb, csaligns, csaligns_index, pair_places, insert_sizes,
			 pairs, pairs_index, unipathsxdb, predicted_copyno,
			 reads, kbb, sub_dir, pull_verbose, SHOW_PAIR_ALIGNS );

  if (track_changes)
    TrackHkpChanges("After PullApartByInsertSize:", h, kbb, data_dir, tmp_dir);
  else
    h.PrintSummaryPlus(cout);

  cout << Date( ) << ": Finding links between components" << endl;

  // Find links between components (display only - doesn't change anything).
  // The call is currently disabled, although the progress message above is
  // still printed.
  // FindLinksBetweenComponents( h,  hb, uniqdb20, csaligns, pairs, reads.size()); 

  IsomorphicComponentCheck(sub_dir, h);

  // Summarize CompressedSeqOnHyper stats.
  if ( CompressedSeqOnHyper::AmbigCount() > 0 )
    cout << "CompressedSeqOnHyper has dealt with ambiguity "
	 << CompressedSeqOnHyper::AmbigCount() << " times" << endl;
}
