/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2007) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

/**
   Program: RunAssemblyThroughUnipaths

   Take real or simulated reads, and invoke other programs to construct the unipaths.
   Specifically, this program orchestrates the following steps:

      - create a <kmer space> of kmers in the reads and (if have reference) in the reference
      - create <read paths> in this kmer space; do <error correction> on the reads, if reads are not perfect
      - create <read unipaths> approximating the <genomic unipaths>; and determine the copy number
        of the read unipaths.
  
   Error Correction (real data and CONSTRUCTION D):
   
   - optional error correction is turned on if ec_rounds > 0.
   - kmer frequencies to use in error correction given in ec_ks
   - base position error probability table name used in error correction should be
     given in ERROR_TABLE_NAME. If no table is given then a flat error profile
     will be used.
   - if USE_QUALITY_SCORES is True then error quality score information is used
     to assist error correction.
   - if REMOVE_SUSPICIOUS_READS is True then reads identified as containing errors
     that can't be repaired are removed instead.
   - if EXCLUDE_PAIRED is True, then paired reads are not used when calculating
     the kmer frequency table and the strong kmer table. Paired reads are still
     error corrected, but using strong kmers found in the unpaired read set only.
   - see ProvisionalEdits for more information
  
   This code leaves one at the point one would be at if the modules ReadsToPaths,
   FindHiQualKmersInReads, PathsHQ, CloseAllReadGaps and (when FIND_MUXES is True)
   FindMuxes had been run.
  
   The output directory is run_dir = PRE/DATA/RUN.  Both it and the data directory
   PRE/DATA must already exist before this code is started, and run_dir must already
   contain reads.fastb, reads.pairto and reads.props.
  
   Question: This program is no longer part of <simulation> -- should it be moved?

   @file
*/

#ifndef FORCE_DEBUG
     #define NDEBUG
#endif

#include "Basevector.h"
#include "Bitvector.h"
#include "MainTools.h"
#include "Feudal.h"
#include "FeudalMimic.h"
#include "math/Functions.h"
#include "KmerRecord.h"
#include "ParseSet.h"
#include "ReadLocation.h"
#include "ReadPairing.h"
#include "SortKmers.h"
#include "TokenizeString.h"
#include "KmerShape.h"
#include "paths/KmerPath.h"
#include "random/NormalRandom.h"
#include "random/Random.h"
#include "random/Shuffle.h"
#include "AlignmentProfile.h"
#include "util/LabelRandomGenerator.h"
#include "paths/simulation/ErrorGenerator.h"
#include "system/Properties.h"

void Run( const String& command, ostream& logfile, const String& logfull ) 
{
  logfile << "Running:" << endl << command << endl;
  SystemSucceed( command + " >> " + logfull + " 2>&1" );
}

/**
   Local function: BuildReadPathsAndGenomePaths

   Create a <unified kmer space> (kmer numbering) for kmers in the error-corrected reads
   and (if we have the reference) in the reference.  Create <read paths> from the corrected
   reads, and (if we have the reference) <genome paths> from the reference genome.

   Input parameters:

      PRE, DATA, RUN - the prefix, data and run dirs
      log, logfull - logfiles
      reads - the corrected reads
      K - kmer size for the kmer space in which we'll create read paths and genome paths

   Reads data files:

      genome.fastb - if haveReference && !readsArePerfect, to create a unified kmer space
         for reads and genome parts, and to create read paths and genome paths in this
	 unified kmer space
      genome_paths - if readsArePerfect, we get read paths directly from genome paths

   Writes data files:

      reads.paths.kN, reads.paths_rc.kN - the read paths, and their reverse complements
      reads.pathsdb.kN - the <read path database> of the read paths, which lets us quickly
          find all reads that contain a given kmer

*/
void BuildReadPathsAndGenomePaths( String PRE, String DATA, String RUN, ostream& log,
				   const String& logfull,
				   const vecbasevector& reads,
				   const Bool realData, const Bool readsArePerfect, const Bool haveReference,
				   const int K,
				   Bool buildGenomePathsRc,
				   String genome_paths, longlong genome_size)
{    String data_dir = PRE + "/" + DATA;
     String run_dir = PRE + "/" + DATA + "/" + RUN;

     cout << "building read paths for K=" << K << endl;

     int nreads = reads.size( );
     
     if ( !( haveReference && readsArePerfect ) ) {
       ForceAssert( haveReference && !readsArePerfect /* either real reads or simulated imperfect reads */ ||
		    !haveReference && !readsArePerfect && realData );
       if ( haveReference && !readsArePerfect ) {
	 // make paths for both genome and reads in the same numbering scheme We
	 // need a basevector (on disk) with the genome and reads concatenated
	 // (the genome comes first to facilitate good kmer numbering).  We then
	 // run <GenomeToPaths> to create kmer paths from this concatenated
	 // file.  We then take the genome paths from the first part of this
	 // concatenated file, and the read paths from the second part of this
	 // concatenated file; these read paths and genome paths are then in the
	 // same kmer space.
	 cout << "Merging genome and reads-with-errors..." << endl;
	 String genome_plus_reads = 
	   run_dir + "/genome_plus_reads.fastb";
	 vecbasevector genome( data_dir + "/genome.fastb" );
	 int ngenome = genome.size();
	 genome.Write( genome_plus_reads, 0, ngenome ); // 3-file version
	 reads.Write( genome_plus_reads, 0, nreads ); // append to 3-file version
	 Destroy( genome );
// MakeDepend: dependency GenomeToPaths
	 Run( "GenomeToPaths HEAD=" + run_dir
	      + "/genome_plus_reads K=" + ToString(K), log, logfull );
	 Remove( genome_plus_reads );  // remove all three files
	 Remove( genome_plus_reads + "..offsets" );
	 Remove( genome_plus_reads + "..static" );
	 genome_plus_reads = run_dir + "/genome_plus_reads.paths.k" + ToString(K);
	 // now separate the reads from the genome
	 // Let's take care of the genome paths first:
	 cout << "Writing out genome" << flush;
	 vecKmerPath genome_kmerpaths;
	 genome_kmerpaths.ReadRange( genome_plus_reads, 0, ngenome );
	 genome_kmerpaths.WriteAll( run_dir + "/genome.paths.k" + ToString(K) );
	 Destroy(genome_kmerpaths);
// MakeDepend: dependency MakeRcDb
         Run( "MakeRcDb PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN + " READS=genome"
              + " K=" + ToString(K) + " FORCE_BIG=True", log, logfull );
	 // Now take care of the read files:
	 // fw:
	 cout << ", paths" << flush;
	 vecKmerPath read_paths;
	 read_paths.ReadRange( genome_plus_reads, ngenome, ngenome + nreads );
	 read_paths.WriteAll( run_dir + "/reads.paths.k" + ToString(K) );	 
	 // rc:

	 cout << ", paths_rc" << flush;
	 vecKmerPath read_paths_rc;
         // We could just copy read_paths and reverse each element, but then
         // each read path would have allocated its own memory in the reverse.
         // By doing it this way, all the read paths store their data in the
         // feudal vector.
	 read_paths_rc.Reserve( read_paths.rawsize(), read_paths.size() );
	 KmerPath temp;
	 for(int i=0; i<read_paths.size(); i++) {
	   temp = read_paths[i];
	   temp.Reverse();
	   read_paths_rc.push_back(temp);
	 }
	 read_paths_rc.WriteAll( run_dir + "/reads.paths_rc.k" + ToString(K) );
	 // db:
	 cout << ", pathsdb" << flush;
	 String pathsdb_file = run_dir + "/reads.pathsdb.k" + ToString(K);
	 int fd = OpenForWrite( pathsdb_file );
	 longlong count = 0;
	 for ( int pass = 1; pass <= 2; pass++ ) {
	   const vecKmerPath& v = ( pass == 1 ? read_paths : read_paths_rc );
	   for ( int pathId = 0; pathId < v.size( ); pathId++ ) {
	     int ind = ( pass == 1 ? pathId : -pathId-1 );
	     for ( int segIdx = 0; segIdx < v[pathId].NSegments( ); segIdx++ ) {
	       const KmerPathInterval& I = v[pathId].Segment(segIdx);
	       if ( !I.isGap( ) ) {
		 ForceAssertLe( (unsigned int)I.Length( ), tagged_rpint::LENGTH_MAX );
		 ForceAssertLe( (unsigned int)segIdx,      tagged_rpint::POSITION_MAX );
		 tagged_rpint t( I.Start( ), I.Length( ), ind, segIdx );
		 WriteBytes( fd, &t, sizeof(t) );    
		 ++count;
	       }
	     }
	   }
	 }
	 close(fd);
	 Destroy(read_paths);
	 Destroy(read_paths_rc);
	 vec<tagged_rpint> segs(count);
	 fd = OpenForRead( pathsdb_file );
	 ReadBytes( fd, &segs[0], count * sizeof(tagged_rpint) );
	 close(fd);
	 Prepare(segs);
	 BinaryWrite3( pathsdb_file, segs );
	 cout << " done" << endl;
       }  // if ( haveReference && !readsArePerfect )
       else {  // if have real data but not the reference
	 
	 // if we have real data, but don't have the reference, then of course we can't
	 //  make a unified kmer space for the read paths and the genome paths -- since there is no
	 // reference from which to construct the genome paths.  In this case, we simply
	 // make read paths from the reads.
	 
// MakeDepend: dependency ReadsToPaths
	 Run( "ReadsToPaths PRE=" + PRE + " DATA=" + DATA + " RUN=" 
              + RUN + " K=" + ToString(K) + " USE_QUALITY_SCORES=False"
              + " GENOME_SIZE=" + ToString(genome_size), log, logfull );
       }
     }  // if ( !( haveReference && readsArePerfect ) )
     else  // if we do have the reference, _and_ have perfect simulated reads from that reference
       //     (and know the <truth> about each read's location on the genome)
       {
	 ForceAssert( haveReference && readsArePerfect );
	 
	 //
	 // In this case, we must have already created <genome paths> by running <PrepareGenome>.
	 // Create read paths, in terms of the genome kmer space, by taking for each read the
	 // portion of the <genome path> covered by the read.  That way, we can reuse the
	 // <run-length encoding> of the genome paths for the read paths.  This is only possible
	 // when the reads track the genome perfectly.
	 //
	 
	 // For each genome path, gather the start locations (offsets) of its <path intervals> on the path.
	 // This will let us quickly find the portion of the genome path covered by a read from that genome part.
	 vecKmerPath gpaths(genome_paths), paths;
	 longlong nintervals = 0;
	 vec< vec<genome_part_pos_t> > start( gpaths.size( ) );
	 for ( int i = 0; i < gpaths.size( ); i++ )
	   {    start[i].reserve( gpaths[i].NSegments( ) + 1 );
	   start[i].push_back(0);
	   for ( int j = 0; j < gpaths[i].NSegments( ); j++ )
	     {    start[i].push_back( start[i].back( ) 
				      + gpaths[i].Segment(j).Length( ) );    }    }
	 if( genome_paths != run_dir + "/genome.paths.k" + ToString(K) &&
	     !IsRegularFile( run_dir + "/genome.paths.k" + ToString(K) ) )
	   Symlink( genome_paths, run_dir + "/genome.paths.k" + ToString(K) );
	 
	 cout << Date( ) << ": building read paths" << endl;

	 // Load locations of reads on the reference
	 vec<read_location> loc_on_ref;
	 vec<genome_part_id_t> genomePartIds( gpaths.size() );
	 for ( genome_part_id_t genomePartId = 0; genomePartId < genomePartIds.isize(); genomePartId++ )
	   genomePartIds[genomePartId] = genomePartId;
	 ReadLocs( run_dir + "/reads.ref.locs", genomePartIds, loc_on_ref );
	 
	 for ( int pass = 1; pass <= 2; pass++ )
	   {    if ( pass == 2 ) paths.Reserve( nintervals, nreads );
	   for ( int j = 0; j < nreads; j++ )
	     {    const read_location& rl = loc_on_ref[j];
	     int m = rl.Contig( );
	     int seg1 = upper_bound( start[m].begin( ), start[m].end( ), 
				     rl.Start( ) ) - start[m].begin( ) - 1;
	     static KmerPath p;
	     p.Clear( );
	     longlong pstart = rl.Start( ) - start[m][seg1];
	     longlong kmers = rl.LengthOfRead( ) - K + 1;
	     int seg = seg1;
	     const KmerPath& gp = gpaths[m];
	     while( kmers > 0 )
	       {    longlong kmers_to_use = Min( kmers, 
						 (longlong) gp.Segment(seg).Length( ) - pstart );
	       kmer_id_t from = gp.Segment(seg).Start( ) + pstart;
	       kmer_id_t to = from + kmers_to_use - (longlong) 1;
	       p.AddSegment( KmerPathInterval( from, to ) );
	       kmers -= kmers_to_use;
	       pstart = 0;
	       seg++;    }
	     if ( rl.Rc( ) ) p.Reverse( );
	     if ( pass == 1 ) nintervals += p.NSegments( );
	     else paths.push_back(p);    }    }
	 cout << Date( ) << ": packaging read paths" << endl;
	 paths.WriteAll( run_dir + "/reads.paths.k" + ToString(K) );
	 vec<KmerPath> paths_rc;
	 longlong nsegments_rc = 0;
	 for ( int i = 0; i < nreads; i++ )
	   {    static KmerPath p;
	   p = paths[i];
	   p.Reverse( );
	   paths_rc.push_back(p);
	   nsegments_rc += p.NSegments( );    }
	 vecKmerPath pathsv_rc;
	 pathsv_rc.Reserve( nsegments_rc, nreads );
	 for ( int i = 0; i < nreads; i++ )
	   pathsv_rc.push_back( paths_rc[i] );
	 pathsv_rc.WriteAll( run_dir + "/reads.paths_rc.k" + ToString(K) );
	 Destroy(paths_rc);
	 Remove( run_dir + "/reads.pathsdb.k" );
	 int fd = OpenForWrite( run_dir + "/reads.pathsdb.k" + ToString(K) );
	 longlong count = 0;
	 for ( int pass = 1; pass <= 2; pass++ )
	   {    const vecKmerPath& v = ( pass == 1 ? paths : pathsv_rc );
	   for ( int i = 0; i < v.size( ); i++ )
	     {    int ind = ( pass == 1 ? i : -i-1 );
	     for ( int j = 0; j < v[i].NSegments( ); j++ )
	       {    const KmerPathInterval& I = v[i].Segment(j);
	       if ( !I.isGap( ) )
		 {    ForceAssertLe( I.Length( ), 32767 );
		 ForceAssertLe( j, 32767 );
		 tagged_rpint t( I.Start( ), I.Length( ), ind, j );
		 WriteBytes( fd, &t, sizeof(t) );    
		 ++count;    }    }    }    }
	 close(fd);
	 Destroy(paths), Destroy(pathsv_rc);
	 vec<tagged_rpint> segs(count);
	 fd = OpenForRead( run_dir + "/reads.pathsdb.k" + ToString(K) );
	 ReadBytes( fd, &segs[0], count * sizeof(tagged_rpint) );
	 close(fd);
	 Prepare(segs);
	 BinaryWrite3( run_dir + "/reads.pathsdb.k" + ToString(K), segs );
       } // if (haveReference && readsArePerfect)
     
     if ( buildGenomePathsRc  &&  IsRegularFile( run_dir + "/genome.paths.k" + ToString(K) ) ) {
// MakeDepend: dependency MakeRcDb
       Run( "MakeRcDb PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN +
	    " K=" + ToString(K) + " READS=genome", log, logfull );
     }
}   // BuildReadPathsAndGenomePaths()



/**
   Program entry point.

   Parses the command-line arguments, checks that the data and run directories and
   the input files exist, optionally runs error correction (ec_rounds > 0), builds
   read paths (and genome paths) for K and for every kmer size in PATH_KS,
   optionally removes suspicious reads, marks all kmers as high quality, and then
   runs Unipather, UnipathCoverage, UnipathLocs, PathsToLocs (simulated data only),
   BuildUnipathLinkGraphs and, if FIND_MUXES is True, FindMuxes and FindSubsumptions.

   Commands run through <Run> are recorded in run_dir/RunAssemblyThroughUnipaths.log
   and their output is appended to run_dir/RunAssemblyThroughUnipaths.log.full.
*/
int main( int argc, char *argv[] )
{
     RunTime( );

     BeginCommandArguments;
     CommandArgument_String(PRE);
     CommandArgument_String(DATA);
     CommandArgument_String(RUN);
     CommandArgument_Int(K);
     CommandArgument_Bool_OrDefault(GC_BIASED, False);
     CommandArgument_String_OrDefault(ec_ks, "");
     CommandArgument_String_OrDefault_Doc(PATH_KS, "", "additional kmer sizes for which read paths should be created");
     CommandArgument_UnsignedInt_OrDefault(ec_rounds, 0);
     CommandArgument_Bool_OrDefault(REMOVE_SUSPICIOUS_READS, False );
     CommandArgument_Bool_OrDefault(REMOVE_UNBUILDABLE_READS, False );
     CommandArgument_Bool_OrDefault(GENOMIC_AS_STRONG, False );
     CommandArgument_Bool_OrDefault(FIND_MUXES, False);
     CommandArgument_UnsignedInt_OrDefault(SEED, 0);
     CommandArgument_Bool_OrDefault(RECOVER_GAPS, False);
     CommandArgument_Bool_OrDefault(SUSPICIOUS_VERBOSE, False);
     CommandArgument_Bool_OrDefault(EXCLUDE_PAIRED, False);
     CommandArgument_String_OrDefault(ERROR_TABLE_NAME, "");
     CommandArgument_Bool_OrDefault(USE_QUALITY_SCORES, False);
     CommandArgument_Int_OrDefault(MAX_ERRORS, 1);
     CommandArgument_Int_OrDefault(MAX_ERROR_ENTRIES, 0);
     CommandArgument_Int_OrDefault(ERROR_PROB_CUTOFF, 10);
     CommandArgument_Bool_OrDefault(ERROR_PEEK_AHEAD, True);
     CommandArgument_Bool_OrDefault(FIX_NEAR_UNIPATH_ERRORS, False);
     CommandArgument_String_OrDefault(PROTECT_GOOD_READS_STATE, "off");
     CommandArgument_Bool_OrDefault(TRANSITIVE_FILL_IN, True);

     EndCommandArguments;

     ///////////////////////////////////////////////////////////
     //
     // Step: Sanity-check the environment and the parameters
     //
     ///////////////////////////////////////////////////////////

     // Define directories.

     ForceAssert( DATA != "" );
     ForceAssert( RUN != "" );
     const String data_dir = PRE + "/" + DATA;
     const String run_dir = PRE + "/" + DATA + "/" + RUN;

     if ( !IsDirectory(data_dir) )
     {    cout << "The DATA directory does not exist.\n";
          cout << "DATA directory = " << data_dir << "\n";
          exit(1);    }

     if ( !IsDirectory(run_dir) )
     {    cout << "The RUN directory does not exist.\n";
          cout << "RUN directory = " << run_dir << "\n";
          exit(1);    }

     // Check for existence of the files on which we depend

     ForceAssert( IsRegularFile( run_dir + "/reads.fastb" ) );
     ForceAssert( IsRegularFile( run_dir + "/reads.pairto" ) );
     ForceAssert( IsRegularFile( run_dir + "/reads.props" ) );
     ForceAssert( IsRegularFile( data_dir + "/genome.size" ) );

     // Create the read lengths file from the reads if it is not there yet.

     if ( ! IsRegularFile( run_dir + "/reads.lengths" ) ) {
       vecbasevector reads( run_dir + "/reads.fastb" );
       vec<int> lens( reads.size() );
       for ( int i = 0; i < reads.size(); ++i )
         lens[i] = reads[i].size();
       BinaryWrite2( run_dir + "/reads.lengths", lens );
     }


     // Load basic properties of the reads

     Properties readProperties( run_dir + "/reads.props" );
     String CONSTRUCTION = readProperties["CONSTRUCTION"];
     Bool realdata = (CONSTRUCTION == "");  // an empty CONSTRUCTION is treated as real data
     Bool readsArePerfect = ( readProperties["PERFECT_READS"] == "True" );
     Bool haveReference = IsRegularFile( data_dir + "/genome.fastb" );

     // Validate those parameters which we can without doing anything complicated
     // first:

     // General

     if ( REMOVE_UNBUILDABLE_READS && ! REMOVE_SUSPICIOUS_READS ) {
       InputErr( "REMOVE_UNBUILDABLE_READS=True requires REMOVE_SUSPICIOUS_READS=True" );
     }

     ForceAssert( CONSTRUCTION == "A" || CONSTRUCTION == "B"
                  || CONSTRUCTION == "C" || CONSTRUCTION == "D" || CONSTRUCTION == "");

     ForceAssertSupportedK( K );
     if (ec_ks != "") {
       vec<KmerShapeId> ec_kshapes;
       ParseKmerShapeIdSet(ec_ks, ec_kshapes);
       ForceAssertSupportedKShapes(ec_kshapes);
     }


     // Determine ploidy.
     ForceAssert( IsRegularFile( data_dir + "/ploidy" ) );
     int PLOIDY = FirstLineOfFile( data_dir + "/ploidy" ).Int( );
     ForceAssert( PLOIDY == 1 || PLOIDY == 2 );

     // Set up timer for the assembly part of this process.  Each timed section
     // adds the wall clock time when it starts and subtracts it when it ends; the
     // last timed section is never closed, so that TimeSince(assembly_clock) at
     // the very end reports (presumably as now - assembly_clock) the total time
     // spent inside all timed sections.
     double assembly_clock = 0.0;

     // Set random seed.
     if ( SEED != 0 ) srandomx(SEED);

     //
     // If we have the reference, and the reads are perfect, we can simplify
     // the creation of the kmer space: the reads consist entirely of kmers
     // in the genome.  So, when creating the <unified kmer space> of the reads
     // and the genome, we can just take the kmers of the genome.
     //
     // Of course, only simulated reads can be perfect, and only if read errors
     // were not simulated.
     //

     String genome_paths;
     if ( haveReference && readsArePerfect ) {
       genome_paths = data_dir + "/genome.paths.k" + ToString(K);
       // The genome paths must have been already produced, by the program <PrepareGenome>.
       ForceAssert( IsRegularFile(genome_paths) );
     }

     longlong genome_size = StringOfFile( data_dir + "/genome.size", 1 ).Int( );
     ForceAssert( genome_size > 0 );

     // Start Logging
     Ofstream( log, run_dir + "/RunAssemblyThroughUnipaths.log" );
     String logfull = run_dir + "/RunAssemblyThroughUnipaths.log.full";
     Remove(logfull);
     cout << "logging to\n" << run_dir + "/RunAssemblyThroughUnipaths.log"
          << "\nand " << run_dir + "/RunAssemblyThroughUnipaths.log.full\n" << endl;
     command.PrintTheCommandPretty( log, CSTRING_FROM_DEFINE( MAKE_DATE ) );

     vecbasevector reads;

     //////////////////////////////////////////////////////////////////////////////
     //
     // Step: Perform Error Correction
     //
     // Try to correct errors in reads, and throw out reads that we know have errors
     // but don't know how to fix.
     //
     // Important to do this before creating <read paths>, since error correction
     // greatly reduces the number of kmers and greatly increases the sharing of
     // <kmer path intervals> between read paths, leading to much more compact/compressed
     // read paths.
     //
     // Only real reads, or simulated imperfect reads that come from <Construction D>,
     // require error correction.
     //
     //////////////////////////////////////////////////////////////////////////////

     if ( ec_rounds > 0 ) {
       ForceAssert( !readsArePerfect );
       assembly_clock += WallClockTime( );
       String qualb_file = (USE_QUALITY_SCORES ? "reads.qualb" : "");

       // make a copy of the raw, uncorrected reads; the corrected versions
       // will then be put into reads.fastb as they are created.
       Cp2( run_dir + "/reads.fastb", run_dir + "/reads.fastb.orig" );

       // Protect good reads: designate some reads as correct, which is a way
       // to prevent "error correction" from introducing errors into the reads.

// MakeDepend: dependency ProtectGoodReads
       Run( "ProtectGoodReads PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
            + " USE_TRUTH=" + ( realdata ? "False" : "True" )
            + " STATE=" + PROTECT_GOOD_READS_STATE,
            log, logfull );

       // Determine which kmers are genomic.  We can then try to error-correct the reads
       // by finding instances of non-genomic kmers in the reads, and mutating the reads to
       // make all kmers in each read genomic.  Reads for which _small_ correcting mutations
       // cannot be found, we will discard (but this requires us to have the <read paths>
       // already, so we'll do that after we <Create kmer space and read paths>).

       if ( GENOMIC_AS_STRONG ) {

         // Cheat by taking the known reference and getting the true set of genomic kmers from it.
         // This tests the best-case performance of our methods but is obviously unavailable for real runs.

// MakeDepend: dependency FindGenomicKmers
         Run( "FindGenomicKmers PRE=" + PRE + " DATA=" + DATA + " K='" + ec_ks + "'",
              log, logfull );

         // Install the genome's kmer frequency table, for each error correction
         // kmer size, as the strong kmer table of the reads.
         vec<int> Ks;
         ParseIntSet( ec_ks, Ks );
         for ( int i = 0; i < Ks.isize(); ++i ) {
           const int ecK = Ks[i];  // deliberately not named K, to keep the main K visible
           Cp2( data_dir + "/genome.fastb.freq_table.k" + ToString(ecK),
                run_dir + "/reads.fastb.orig.strong.k" + ToString(ecK) );
         }
       }
       else {
         // The normal flow: decide which kmers are genomic by looking at how many times
         // a kmer occurs in the reads.  If a kmer really occurs somewhere in the genome,
         // with sufficiently high coverage at least some reads will land on it and read it
         // correctly.

// MakeDepend: dependency FindStrongKmers
         Run( "FindStrongKmers PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
              + " K='" + ec_ks + "'"
              + " READS_IN=reads.fastb.orig"
              + " GC_BIASED=" + ( GC_BIASED ? "True" : "False" )
              + " EXCLUDE_PAIRED=" + ( EXCLUDE_PAIRED ? "True" : "False" )
              , log, logfull );
       }
// MakeDepend: dependency ProvisionalEdits
       Run( "ProvisionalEdits PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
            + " K='" + ec_ks + "'"
            + " READS_IN=reads.fastb.orig READS_OUT=reads.fastb"
            + " QUALS_IN=" + qualb_file
            + " ERROR_TABLE_NAME=" + ERROR_TABLE_NAME
            + " MAX_ERRORS=" + ToString(MAX_ERRORS)
            + " MAX_ERROR_ENTRIES=" + ToString(MAX_ERROR_ENTRIES)
            + " ERROR_PROB_CUTOFF=" + ToString(ERROR_PROB_CUTOFF)
            + " ERROR_PEEK_AHEAD=" + ( ERROR_PEEK_AHEAD ? "True" : "False" )
            , log, logfull );
       assembly_clock -= WallClockTime( );
     }
     // Import corrected reads if we did error correction, or the original reads
     // if we didn't.
     reads.ReadAll( run_dir + "/reads.fastb" );

     //
     // Live variables at this point:
     //
     //    reads - the error-corrected reads
     //

     //////////////////////////////////////////////////////////////////////////////
     //
     // Step: Create kmer space and read paths
     //
     // Assign <kmer numbers> to all kmers in the reads and (if we have the reference)
     // in the reference, creating a unified kmer space for the reads and the reference.
     // Note that this is after error correction, so many of the non-genomic kmers
     // in the raw reads have been removed at this point.
     //
     // Create <read paths> and <genome paths>, representing the reads and (if we have it)
     // the reference as sequences of kmers rather than of bases, with the kmer sequences
     // represented in a compressed way using kmer ranges whenever possible.
     //
     // See <ReadsToPaths> and <GenomeToPaths>.
     //
     //////////////////////////////////////////////////////////////////////////////


     // Index -1 stands for the main K; indices 0.. walk the additional kmer sizes.
     vec<nbases_t> pathKs;
     ParseIntSet( PATH_KS, pathKs );
     for (int i = -1; i < pathKs.isize(); i++ )
       BuildReadPathsAndGenomePaths( PRE, DATA, RUN, log, logfull, reads,
                                     realdata, readsArePerfect, haveReference,
                                     i < 0 ? K : pathKs[i],
                                     i < 0, /* only build genome paths for the main K */
                                     genome_paths, genome_size );


     // To save memory, kill some variables that aren't needed anymore.  Also
     // destroy reads, which we'll bring back in later.

     Destroy(reads);

     // Remove Suspicious Reads (if reads are imperfect)

     if( REMOVE_SUSPICIOUS_READS ) {
       ForceAssert( !readsArePerfect );
       assembly_clock += WallClockTime( );

// MakeDepend: dependency RemoveSuspiciousReads
       Run( "RemoveSuspiciousReads PRE=" + PRE
            + " DATA=" + DATA + " RUN=" + RUN
            + " K=" + ToString(K)
            + " PATH_KS='" + PATH_KS + "'"
            + " LOW_FREQ_K='" + ec_ks + "'"
            + " RECOVER_GAPS=" + ( RECOVER_GAPS ? "True" : "False" )
            + " REMOVE_UNBUILDABLE=" + ( REMOVE_UNBUILDABLE_READS ? "True" : "False" )
            + " FIX_NEAR_UNIPATH_ERRORS="
            + ( FIX_NEAR_UNIPATH_ERRORS ? "True" : "False" )
            + " PLOIDY=" + ToString(PLOIDY)
            + " VERBOSE=" + ( SUSPICIOUS_VERBOSE ? "True" : "False" )
            + " SIM=" + ( realdata ? "False" : "True" ) , log, logfull );
       assembly_clock -= WallClockTime( );
     }

     // Call all kmers high-quality.  Run equivalent of PathsHQ.

     vecbasevector reads2( run_dir + "/reads.fastb" ); // bring reads back
     longlong nreads = reads2.size();

     // This timed section is intentionally left open until the final report.
     assembly_clock += WallClockTime( );
     longlong totalKmers = 0;
     vec<int> kmersPerRead(nreads);
     for ( int id = 0; id < nreads; ++id )
     {    int numKmers = Max( 0, reads2[id].isize( ) - K + 1 );
          totalKmers += ( kmersPerRead[id] = numKmers );    }
     vecbitvector isHiQualKmer;
     isHiQualKmer.Reserve( totalKmers/32 + nreads, nreads );
     for ( int id = 0; id < nreads; ++id )
     {    bitvector b( kmersPerRead[id] );
          b.Set( 0, b.size( ), True );
          isHiQualKmer.push_back(b);    }
     isHiQualKmer.WriteAll( run_dir + "/reads.hqkmers.k" + ToString(K) );

     // Since every kmer is high quality, the high-quality paths are just links to
     // the ordinary read paths.
     if ( !IsRegularFile( run_dir + "/reads.pathshq.k" + ToString(K) ) ) {
       Symlink( "reads.paths.k" + ToString(K),
                run_dir + "/reads.pathshq.k" + ToString(K) );
       Symlink( "reads.paths_rc.k" + ToString(K),
                run_dir + "/reads.pathshq_rc.k" + ToString(K) );
       Symlink( "reads.pathsdb.k" + ToString(K),
                run_dir + "/reads.pathshqdb.k" + ToString(K) );
     }

     // No read is trimmed on either side.
     vec<int> left_trim(nreads, 0), right_trim(nreads, 0);
     BinaryWrite2( run_dir + "/reads.paths.left_trim.k" + ToString(K), left_trim );
     BinaryWrite2( run_dir + "/reads.paths.right_trim.k" + ToString(K), right_trim );
// MakeDepend: dependency Unipather
     Run( "Unipather PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
          + " K=" + ToString(K) + " SIM=" + ( realdata ? "False" : "True" ),
          log, logfull );
     String error_rate;
     if( ! readsArePerfect &&  readProperties.IsDefined( "AVG_MUTATION_RATE" ) )
       error_rate = String(" ERROR_RATE=") + readProperties["AVG_MUTATION_RATE"];
// MakeDepend: dependency UnipathCoverage
     Run( "UnipathCoverage PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
          + " PLOIDY=" + ToString(PLOIDY) + " K=" + ToString(K) + error_rate,
          log, logfull );
// MakeDepend: dependency UnipathLocs
     Run( "UnipathLocs PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
          + " K=" + ToString(K), log, logfull );

// MakeDepend: dependency PathsToLocs
     if (!realdata)
     {    cout << "Running PathsToLocs... " << flush;

          // This command sends its standard output to reads.unipaths.mosaic, so it
          // cannot go through Run(): Run() appends its own ">> logfull" redirection
          // of standard output, and the shell lets the last redirection win, which
          // would leave the mosaic file empty.  Log the command by hand and append
          // only standard error to the full log.
          const String placementsCommand =
               "PathsToLocs PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
               + " K=" + ToString(K) + " SAVE_PLACEMENTS_TO=" + PRE + "/" + DATA + "/"
               + RUN + "/reads.unipaths.placements" + " PATHS=" + PRE + "/" + DATA
               + "/" + RUN + "/reads.unipaths.k" + ToString(K) + " > " + PRE + "/"
               + DATA + "/" + RUN + "/reads.unipaths.mosaic";
          log << "Running:" << endl << placementsCommand << endl;
          SystemSucceed( placementsCommand + " 2>> " + logfull );

          SystemSucceed( "PathsToLocs PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
               + " K=" + ToString(K) + " PATHS=" + PRE + "/" + DATA + "/" + RUN
               + "/reads.unipaths.k" + ToString(K) + " > " + PRE + "/" + DATA
               + "/" + RUN + "/PathsToLocs.out" );
          cout << "done." << endl;     }
// MakeDepend: dependency BuildUnipathLinkGraphs
     Run( "BuildUnipathLinkGraphs PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
          + " TRANSITIVE_FILL_IN=" + ( TRANSITIVE_FILL_IN ? "True" : "False" )
          + " K=" + ToString(K), log, logfull );

     // Generate muxes.

// MakeDepend: dependency FindMuxes
// MakeDepend: dependency FindSubsumptions
     if (FIND_MUXES)
     {    Run( "FindMuxes PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
               + " K=" + ToString(K) + " minNumKmers=1", log, logfull );
          Run( "FindSubsumptions PRE=" + PRE + " DATA=" + DATA + " RUN=" + RUN
               + " K=" + ToString(K) + " minNumKmers=1", log, logfull );    }

     // Report time used on assembly stuff.

     cout << TimeSince(assembly_clock)
          << " used for initial part of assembly" << endl;
     log << TimeSince(assembly_clock)
          << " used for initial part of assembly" << endl;    }
