/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2007) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

/**
   Program: SimulateReads

   Given a known genome, creates simulated reads from that genome.

   Prepares input for <RunAssemblyThroughUnipaths>.  For real reads,
   <SetupRealReadsForAssembly> puts real reads into the same format
   output for simulated reads by this program.

   Reads data files (in the <data dir>):

      genome.fastb - the genome from which to simulate the reads
         Note that the genome may consist of several disconnected parts (e.g. chromosomes).
      ploidy - says whether the genome is diploid or haploid; 

   Creates data files (in the <run dir>):

      reads.fastb - the simulated reads
      reads.lengths - the length of each read
      reads.error_count - how many errors in each simulated read (only for reads
          produced using an error generator)
      reads.true.fastb - the error-free version of each read
      reads.ref.locs - where on the reference each simulated read comes from
      reads.qualb - the simulated quality scores (only reads produced using an
          error generator)
      reads.ref.locs - for each read, the location on the reference.
          (From it and <genome.fastb> the contents of <reads.true.fastb> could be
	  reconstructed).
      reads.pairto, reads.pairtob, reads.pairto_index - for paired simulated reads, the
         read pairings (pairs of reads in reads.fastb between which the approximate
	 distance is known).
      reads.props - a <Properties> file giving some properties of the reads: which
         construction was used, whether the reads have errors, etc.  
      genome.size - the size of the genome (the sum of sizes of its parts).

   Read <Properties> written to reads.props:

        CONSTRUCTION - name of the construction used to generate simulated reads (A,B,C,D)
	PERFECT_READS - "True" if no read errors were simulated, "False" if read errors were simulated.
   
   Derived from <SeqToPaths>.

   There are four possible construction methods (methods for generating
   simulated reads from a genome), A, B, C or D as described
   below.
  
   Only construction D is currently used and supported (04-17-07).
   Construction B is used in the paper so it should also work (06-07-07).

   Constructions A,B,C generate perfect reads, while construction
   D simulates errors in reads.

   In the description below, there are two common parameters:

      n - the number of simulated reads to generate
      N - the fragment size, for paired reads.
  
   CONSTRUCTION=A:
  
   - chop G into n-base perfect reads, shifting by one base to get the next read;
   - for each two reads whose span is exactly N bases, generate a read pair whose
     separation is N-2n and whose standard deviation is dev, then randomly delete
     half the pairs so that no read lies in two pairs;
   - chop G into 20kb chunks, treat as extra reads (temporary, see below) - this
     facilitates good k-mer numbering;
   - build k-mer paths for reads, then discard the 20kb reads.
  
   CONSTRUCTION=B (intended as a memory-efficient replacement for A):
  
   - find the k-mers of G which appear more than once in G (or its reverse
     complement);
   - find the maximal subsequences of G which do not contain duplicate k-mers in
     their interiors, and make them into reads if they have length >= n;
   - every subsequence of length n which contains a duplicate k-mer (and not 
     subsumed by a maximal subsequence) is made into a read;
   - at random create npairs read pairs, consisting of n-base perfect reads from
     N-base segments on G; these are assigned insert parameters (N-2n, dev);
   - build k-mer paths for the reads.
   For construction B, we require that a paths file for the genome has been created
   using GenomeToPaths.
  
   CONSTRUCTION=C (intended as a control for B):
  
   - chop G into n-base perfect reads, shifting by one base to get the next read;
   - at random create npairs read pairs, consisting of n-base perfect reads from
     N-base segments on G; these are assigned insert parameters (N-2n, dev);
   - build k-mer paths for the reads.
  
   CONSTRUCTION=D:
  
   There are four parameters: n, the read size (all simulated reads will have this
   exact size); N, the <fragment> size; C, the coverage, defined as
   C = ((# of reads) x (read size n) / (genome size G)); and dev, the allowed variation
   in fragment size N (dev=10% meaning 10% of N).  This is for generating paired reads;
   for generating unpaired reads, you specify only the read length n and the coverage
   C; see below for details.  See <CreateRandomPairs()> and <CreateRandomReads()> 
   for implementation.
  
   - the argument LIBRARIES should have the form n=n1,N=N1,dev=dev1,C=C1 or
     n=n1,N=N1,dev=dev1,C=C1:n=n2,N=N2,dev=dev2,C=C2 etc., specifying one, two, 
     or more libraries;
   - we allow e.g. dev=10%, meaning 10% of N;
   - if any of the variables n, N, dev, or C is unspecified, but was specified for a
     previous library, inherit that value, inheriting percentages as percentages 
     rather than in terms of the N it was specified with before;
   - we also allow a library i to be specified by n=ni,C=Ci
     which causes unpaired reads of length ni to be created at random to coverage Ci
     (but BEWARE of the preceding rule);
   - for each paired-end library, at random create read pairs at sequence coverage 
     C, consisting of n-base imperfect reads from (N+/-dev)-base segments on G, with
     substitutions introduced at rate MUTATION_PERCENT or using an error template
     specified with ERROR_GENERATOR_NAME; these are assigned insert parameters (N-2n, dev);
   - use either MUTATION_PERCENT or ERROR_GENERATOR_NAME, not both at the same time.
   - if neither MUTATION_PERCENT nor ERROR_GENERATOR_NAME is specified then reads
     with no errors are generated.
   - The arg MUTATION_PERCENT also accepts the name of a file (preceded by "@")
     containing per-position error rates.
   - Optional error correction.
   - build k-mer paths for the reads.
   - if MUTATION_PERCENT=0, we require that a paths file for the genome has been
     created using GenomeToPaths.
  
   In A, B, C, and D:
   - assign quality 50 to each base;
   - force all k-mers to be high-quality.
  
   The output directory is run_dir = PRE/DATA/RUN.  It should not exist before this
   code is started, however the data directory should.
  
   OPTIONS:
   - SEED=x will set the random seed to x.  Note that this only applies to some of
   the random numbers which are generated.
  
   NOTE:
   The run directory must contain a file "ploidy" having only one line: either
   "1" or "2".  If ploidy = 2, coverage levels are all divided by two.
  
   NOTE:
   CONSTRUCTIONs A, B and C are outdated and may no longer work correctly.

   About: diploid genome

   If you have a haploid genome, you can simulate a second haplotype and thus
   get a simulated diploid genome by using the program <MutateReference>.

   @file
*/

#ifndef FORCE_DEBUG
     #define NDEBUG
#endif

#include "Basevector.h"
#include "Bitvector.h"
#include "MainTools.h"
#include "Feudal.h"
#include "FeudalMimic.h"
#include "math/Functions.h"
#include "KmerRecord.h"
#include "ParseSet.h"
#include "ReadLocation.h"
#include "ReadPairing.h"
#include "SortKmers.h"
#include "TokenizeString.h"
#include "KmerShape.h"
#include "paths/KmerPath.h"
#include "random/NormalRandom.h"
#include "random/Random.h"
#include "random/Shuffle.h"
#include "system/Properties.h"
#include "AlignmentProfile.h"
#include "util/LabelRandomGenerator.h"
#include "paths/simulation/ErrorGenerator.h"

// Simulated read error method.  Selects how CreateRandomPairs() and
// CreateRandomReads() introduce errors into the reads they generate:
//   NONE            - reads are left error-free;
//   MUTATION_RATES  - bases are substituted at random by ApplyMutationRates();
//   ERROR_GENERATOR - reads are built from error profiles drawn from an
//                     ErrorGenerator.
enum methodType { NONE, MUTATION_RATES, ERROR_GENERATOR };

void Run( const String& command, ostream& logfile, const String& logfull ) 
{
  logfile << "Running:" << endl << command << endl;
  SystemSucceed( command + " >> " + logfull + " 2>&1" );
}

/**
   Local function: GetMutsPerMil
  
   Converts an error profile (fractional mutation rates) into a table of
   integer mutation rates per million bases, one entry per read position.
  
   Input parameters:
  
      n - read length;
      mutation_rates - the error profile: either one rate per read position, or
        a table with a single entry, which is then used for all n positions.
  
   Output parameters:

      muts_per_mil - resized and filled with floor( rate * 1,000,000 ) for each
        position.  It gets n entries when the profile has a single entry, and
        otherwise as many entries as the profile has.

   Note:
      <ApplyMutationRates> indexes this table by read position, so a
      multi-entry profile shorter than the read length leaves positions
      without an entry.
*/  
void GetMutsPerMil(const int n, const vec<double>& mutation_rates, vec<int>& muts_per_mil) {
  // A one-entry profile is a uniform rate that is replicated n times.
  const bool uniform = ( mutation_rates.size() == 1 );
  const int npos = ( uniform ? n : (int) mutation_rates.size() );

  muts_per_mil.resize( npos );
  for ( int p = 0; p < npos; ++p ) {
    const double rate = ( uniform ? mutation_rates.front() : mutation_rates[p] );
    muts_per_mil[p] = int( floor( rate * 1000000.0 ) );
  }
}

/**
   Local function: ApplyMutationRates
  
   Introduces random substitutions into a basevector holding a perfect
   simulated read, using a per-position table of mutation rates.  The result
   is a read which may contain errors.
  
   Input parameters:
  
      b - read to mutate
      muts_per_mil - mutation rate (per million bases) for each base position
        in the read; indexed by every position of b.
  
   Output parameters:
      b - mutated read (modified in place)

*/  
void ApplyMutationRates(basevector& b, const vec<int>& muts_per_mil) {
  // Deliberately draw from rand( ), a DIFFERENT random number generator than
  // the one used to place reads, to facilitate comparisons between projects
  // computed with and without mutations.
  const int nbases = b.isize( );
  for ( int pos = 0; pos < nbases; ++pos ) {
    // One draw per base decides whether this position mutates.
    const int draw = rand( ) % 1000000;
    if ( draw >= muts_per_mil[pos] ) continue;

    // Adding 1, 2 or 3 modulo 4 never maps a base code onto itself.
    const int shift = ( rand( ) % 3 ) + 1;
    b.Set( pos, ( b[pos] + shift ) % 4 );
  }
}

/**
   Local function: CalculateGenomeBases
  
   Given a genome and a read or insert size, counts the distinct positions in
   the genome at which a simulated read or insert of that size can start.

   Input parameters:
  
     genome - the genome to use
     nbases - size of insert or read (in terms of bases on the reference)
  
   Output parameters:
     returns the sum, over all contigs at least nbases long, of
     (contig length - nbases + 1).  Shorter contigs contribute nothing.

*/  
longlong CalculateGenomeBases(const vecbasevector& genome, const int nbases) {
  longlong total = 0;
  for ( int c = 0; c < genome.size( ); ++c ) {
    const int contig_len = genome[c].isize();
    if ( contig_len >= nbases )
      total += contig_len - nbases + 1;
  }
  return total;
}

/**
   Local function: FindContigAndUpdatePosition

   Related to <CalculateGenomeBases>, this function finds the contig and the
   position on that contig corresponding to a position on the genome.  The
   position on the genome is an index into the distinct positions at which a
   simulated insert or read of the given size can start, numbered contig by
   contig in genome order - the same positions counted by
   <CalculateGenomeBases>.

   Input parameters:
  
     genome - the genome to use
     nbases - size of insert or read (in terms of bases on the reference)
     pos - a position on the genome, in the range
           [0, CalculateGenomeBases(genome, nbases))
  
   Output parameters:
     returns the contig index where pos lies
     pos - the corresponding start position on that contig

   Errors:
     Calls FatalErr if pos is negative or is not smaller than the number of
     valid start positions (which includes the case where no contig is at
     least nbases long).

*/  
genome_part_id_t FindContigAndUpdatePosition(const vecbasevector& genome, const int nbases, longlong& pos) {
  // A negative index can never be valid; without this check it would be
  // returned unchanged together with the first usable contig.
  if ( pos >= 0 ) {
    for ( int i = 0; i < genome.size( ); i++ ) {
      const int gsize = genome[i].isize();
      if ( gsize < nbases ) continue;   // contig too short to hold the read

      // Number of valid start positions on this contig.
      const longlong nstarts = longlong( gsize ) - nbases + 1;
      if ( pos < nstarts ) return i;
      pos -= nstarts;
    }
  }
  FatalErr("Read start position in genome not valid");
  // Not expected to be reached; keeps every path of this non-void function
  // returning a value.
  return -1;
}


/**
   Local function: CreateRandomPairs
  
   Generate simulated paired reads from a genome.
   If npairs is large relative to the genome size,
   this will be slow.
  
   Input parameters:
  
      genome - the known genome from which the simulated reads are generated
      npairs - number of simulated read pairs to generate
      n - read length; all simulated reads will have this exact length.
      N - fragment length
      dev - the stddev for the variation in fragment size N (in the absolute number of bases,
            not as percentage of N);
            only applied if apply_dev is True (otherwise all fragment sizes are the same
            and all paired reads have the same distance between them).  this simulates
            the fact that in real <libraries>, there is variation in fragment sizes
            (although the laboratory procedures are tuned to try to minimize this variation).
      apply_dev - whether the model variation in fragment sizes, or to make all simulated
         fragments the same size.
      method - method to use to simulate reads with errors (default NONE)
  
   Output parameters:
  
      reads - the simulated reads.  note that these simulated reads will have
         artificially generated errors; true_reads gives the actual, error-free original
         simulated reads.
      true_reads - the true original simulated reads, before any errors are introduced.
         only generated if `generate_true_reads' is true.  of course, only for
         simulated reads can we know this with certainty.
      generate_true_reads - whether to generate true reads
      quals - qualities of the simulated reads
      pairs - the pairings of the simulated reads, showing for each simulated read pair
         which two reads in _reads_ comprise that pair
      loc_on_ref - the true location of each read on the genome (of course, only for
         simulated reads can we know this with certainty.)
  
   Shares code with <CreateRandomReads()>.
*/  
void CreateRandomPairs( int npairs, vec<read_pairing>& pairs, 
     vec<read_location>& loc_on_ref, const vecbasevector& genome, 
     vecbasevector& reads,  vecqualvector& quals, vecbasevector& true_reads,
     int N, int n, double dev, 
     Bool apply_dev = False,
     methodType method = NONE,
     const vec<double>* mutation_rates_ptr = 0,
     const ErrorGenerator* egen_ptr = 0,
     Bool generate_true_reads = False,
     vec<int>* error_count_ptr = 0)
{
  cout << "Creating " << npairs << " pair reads for inserts of length " 
       << N << " +/- " << dev << " ";
  if ( ERROR_GENERATOR == method)
    cout << "using error generator." << endl;
  else if ( MUTATION_RATES == method)
    cout << "using mutation rates." << endl;
  else if ( NONE == method )
    cout << "without errors." << endl;

  // Insert deviation distribution
  NormalRandom normal_rv( 0, dev );
  
  vec<int> muts_per_mil;
  if (MUTATION_RATES == method)
    GetMutsPerMil(n, *mutation_rates_ptr, muts_per_mil);

  loc_on_ref.reserve( 2 * npairs );

  // Generate each read pair in turn
  for ( int i = 0; i < npairs; i++ ) {
    int insert_length = N;
    if (apply_dev) insert_length += int( round( normal_rv.value( ) ) );
    
    // Determine number of bases in genome (posrange) an insert can start from given
    // the insert size (including deviation)
    longlong posrange = CalculateGenomeBases(genome, insert_length);

    // Pick random position in entire genome for start of insert
    longlong pos = big_random( ) % posrange;
    
    // Find contig and position on contig for start of insert
    genome_part_id_t contig = FindContigAndUpdatePosition(genome, insert_length, pos);

    
    const basevector& g = genome[contig];
    
    static basevector b1, b2;
    b1.SetToSubOf( g, pos, n );
    b2.SetToSubOf( g, pos + insert_length - n, n );
    b2.ReverseComplement( );
    if (generate_true_reads) {
      true_reads.push_back(b1);
      true_reads.push_back(b2);    
    }

    // Use error generator to create simulated reads with errors
    if ( ERROR_GENERATOR == method ) {
      AlignmentProfile profile1, profile2;
      qualvector qual1, qual2;
      egen_ptr->GetErrorProfile(profile1, qual1, n);
      egen_ptr->GetErrorProfile(profile2, qual2, n);
      profile1.CreateRead(g, pos, b1);
      profile2.CreateRcRead(g, pos + insert_length - 1, b2);
      quals.push_back_reserve(qual1);
      quals.push_back_reserve(qual2);
      error_count_ptr->push_back(profile1.GetErrorCount());
      error_count_ptr->push_back(profile2.GetErrorCount());
    }		 

    // Use mutation rate or rate table create simulated reads with errors
    if ( MUTATION_RATES == method ) {
      ApplyMutationRates(b1, muts_per_mil);
      ApplyMutationRates(b2, muts_per_mil);
    }

    loc_on_ref.push_back( read_location( reads.size( ), n, contig,
					 pos, ForwardOr, g.size( ) ) );
    loc_on_ref.push_back( read_location( reads.size( ) + 1, n, contig,
					 pos + insert_length - n, ReverseOr, g.size( ) ) );
    read_pairing p(reads.size() , reads.size() + 1,  N - 2*n, int(round(dev)));
    p.t = other;
    p.weight = 1;
    pairs.push_back(p);

    reads.push_back(b1);
    reads.push_back(b2);

    if ( i > 0 && ( i % (npairs/100) == 0 ) )
      Dot( cout, i / (npairs/100) - 1 );
  }
  cout << endl;    
}


/**
   Local function: CreateRandomReads

   Create simulated unpaired reads from a genome.  Each read is placed at a
   random position of the genome and is reverse complemented with
   probability one half.

   Input parameters:
  
      genome - the known genome from which the simulated reads are generated
      n - read length; all simulated reads will have this exact length.
      nreads - the number of reads to generate; this has been calculated from
         genome size and the requested <coverage>.
      method - method to use to simulate reads with errors (default NONE)
      mutation_rates_ptr - error profile handed to <GetMutsPerMil>.  Must be non-null
         when method is MUTATION_RATES (it is dereferenced unconditionally then).
      egen_ptr - source of error profiles and quality scores.  Must be non-null when
         method is ERROR_GENERATOR.
      generate_true_reads - whether to generate true reads
  
   Output parameters:
  
      reads - the simulated reads.  note that these simulated reads will have
         artificially generated errors; true_reads gives the actual, error-free original
         simulated reads.
      true_reads - the true original simulated reads, before any errors are introduced.
         only generated if `generate_true_reads' is true.  of course, only for
         simulated reads can we know this with certainty.
      quals - qualities of the simulated reads; only appended to when method is
         ERROR_GENERATOR.
      loc_on_ref - the true location of each read on the genome (of course, only for
         simulated reads can we know this with certainty.)
      error_count_ptr - receives the error count of each read's profile.  Must be
         non-null when method is ERROR_GENERATOR; unused otherwise.

   Shares code with <CreateRandomPairs()>.
*/
void CreateRandomReads( int nreads, vec<read_location>& loc_on_ref, 
                        const vecbasevector& genome, vecbasevector& reads, vecqualvector& quals, 
                        vecbasevector& true_reads, int n,
			methodType method = NONE,
                        const vec<double>* mutation_rates_ptr = 0, 
                        const ErrorGenerator* egen_ptr = 0,
                        Bool generate_true_reads = False,
                        vec<int>* error_count_ptr = 0)
{    
  cout << "Creating " << nreads << " unpaired reads of length " << n << " ";
  if ( ERROR_GENERATOR == method)
    cout << "using error generator." << endl;
  else if ( MUTATION_RATES == method)
    cout << "using mutation rates." << endl;
  else if ( NONE == method )
    cout << "without errors." << endl;

  // Determine number of bases in genome (gbases) a read can start from given the
  // minimum and maximum size (on the reference) of a read
  int min_bases = n, max_bases = n;
  if (ERROR_GENERATOR == method)
    egen_ptr->GetMinMaxRefBaseCount(min_bases, max_bases, n);
  map<int, longlong> gbases;
  for ( int nbases = min_bases; nbases <= max_bases; ++nbases)
    gbases[nbases] = CalculateGenomeBases(genome, nbases);
  
  vec<int> muts_per_mil;
  if (MUTATION_RATES == method)
    GetMutsPerMil(n, *mutation_rates_ptr, muts_per_mil);

  // Make room for the new locations on top of any already present.
  loc_on_ref.reserve( loc_on_ref.size( ) + nreads );

  // Progress is reported about once per percent of the reads.  For fewer than
  // 100 reads nreads/100 is zero, which must not be used as a modulus.
  const int progress_step = ( nreads >= 100 ? nreads / 100 : 1 );

  // Generate each read in turn
  for ( int i = 0; i < nreads; i++ ) {
    longlong pos = big_random( );  // Random number used to pick pos on genome
    bool rc = random() & 1;

    int nbases = n; // number of bases on reference required to generate read

    AlignmentProfile profile;
    // Use error generator to pick error profile with quality scores
    if ( ERROR_GENERATOR == method ) {
      qualvector qual;
      egen_ptr->GetErrorProfile(profile, qual, n);
      quals.push_back_reserve(qual);
      error_count_ptr->push_back(profile.GetErrorCount());
      nbases = profile.GetRefBaseCount();  // number of bases on reference required
    }

    // Look up the number of start positions for this reference length.  A
    // length outside the precomputed [min_bases, max_bases] range is computed
    // on demand, rather than letting map::operator[] insert a zero for it.
    if ( gbases.count( nbases ) == 0 )
      gbases[nbases] = CalculateGenomeBases(genome, nbases);
    const longlong posrange = gbases[nbases];

    // Pick random position in entire genome for start of read.  When no contig
    // can hold the read posrange is 0: skip the modulo (it would be a division
    // by zero) and let FindContigAndUpdatePosition report the error, since it
    // will find no usable contig.
    pos = ( posrange > 0 ? pos % posrange : 0 );

    // Find contig and position on contig for start of read
    int contig = FindContigAndUpdatePosition(genome, nbases, pos);

    const basevector& g = genome[contig];
    
    // NOTE(review): the error-free read is always n reference bases starting
    // at pos, even when the profile consumes nbases != n reference bases; if
    // nbases < n this may extend past the end of the contig - confirm.
    static basevector b;
    b.SetToSubOf( g, pos, n );
    if (rc) b.ReverseComplement();
    if (generate_true_reads) true_reads.push_back(b);
    
    // Use profile from error generator to create simulated reads with errors
    if ( ERROR_GENERATOR == method ) {
      if ( rc )
	profile.CreateRcRead(g, pos+nbases-1, b);
      else
	profile.CreateRead(g, pos, b);
    }
    
    // Use mutation rate or rate table create simulated reads with errors
    if ( MUTATION_RATES == method )
      ApplyMutationRates(b, muts_per_mil);

    loc_on_ref.push_back( read_location( reads.size( ), n, contig,
					 pos, ( rc ? ReverseOr : ForwardOr ), g.size( ) ) );
    reads.push_back(b);

    if ( i > 0 && ( i % progress_step == 0 ) )
      Dot( cout, i / progress_step - 1 );
  }
  cout << endl; 
}

/**
   Local function: SimulateReads_Construction_A

   Simulate reads according to Construction A.

   - chop G into n-base perfect reads, shifting by one base to get the next read;
   - for each two reads whose span is exactly N bases, generate a read pair whose
     separation is N-2n and whose standard deviation is dev, then randomly delete
     half the pairs so that no read lies in two pairs;
   - chop G into 20kb chunks, treat as extra reads (temporary, see below) - this
     facilitates good k-mer numbering;
   - build k-mer paths for reads, then discard the 20kb reads.

   Input parameters:

      run_dir - directory in which reads.fastb and the pairing files are written
      log, logfull - log stream and log file name handed to <Run>
      genome - the genome to chop into reads
      hn - length of the long ("huge") chunk reads
      n - read length
      N - span of a read pair
      dev - recorded (rounded) as the standard deviation of each pair

   Output parameters:

      nreads, reads_rawsize - incremented by the number of n-base reads and by
         (n+15)/16 per read (presumably callers pass in zero - verify).
      reads - reloaded from run_dir/reads.fastb and truncated to nreads entries
      pairs - the surviving read pairs

   Note:
      npairs, quals and loc_on_ref are not used by this function.  No k-mer
      paths are built in this function itself.
*/
void SimulateReads_Construction_A( const String& run_dir, ostream& log, const String& logfull, 
                                   const vecbasevector& genome,
				   const int npairs, const int hn, const int n, const int N,  
				   const double dev,

				   longlong& nreads, longlong& reads_rawsize,
				   vecbasevector& reads,  vecqualvector& quals,
				   vec<read_pairing>& pairs,
				   vec<read_location>& loc_on_ref )  
{
  // Generate provisional assembly data structures.
  {    
    // Build reads.fastb and initial read pairs.
    
    // NOTE: this local 'reads' shadows the 'reads' parameter for the whole of
    // this inner scope; the parameter is only touched after the scope closes.
    vecbasevector reads, hugereads;
    longlong hugereads_rawsize = 0, nhugereads = 0;
    // Pass 1 only counts reads and raw sizes so that pass 2 can reserve the
    // exact space before actually creating the reads and pairs.
    for ( int pass = 1; pass <= 2; pass++ ) {
      if ( pass == 2 ) {
        reads.Reserve( reads_rawsize, nreads );
	pairs.reserve(nreads);
	hugereads.Reserve( hugereads_rawsize, nhugereads );    
      }
      for ( int i = 0; i < genome.size( ); i++ ) {
        const basevector& g = genome[i];
	static basevector r;
	// One n-base read per start position j on this contig.
	for ( int j = 0; j <= g.isize( ) - n; j++ ) {
          if ( pass == 1 ) {
            reads_rawsize += (n+15)/16;
	    nreads++;    
          }
	  else {
            r.SetToSubOf( g, j, n );
	    reads.push_back(r);    
	    // Pair this read with the read starting N-n bases further along
	    // the same contig, so that the two together span exactly N bases.
	    // Reads are numbered in start-position order, hence the partner's
	    // id is id1 + N - n; it exists because j <= g.isize( ) - N.
	    if ( j <= g.isize( ) - N ) {
              read_pairing p;
	      p.id1 = reads.size( ) - 1;
	      p.id2 = p.id1 + N - n;
	      p.sep = N - 2*n;
	      p.sd = int(round(dev));
	      p.t = other;
	      p.weight = 1;
	      pairs.push_back(p);
            }
          }
        }
	// Tile the contig with hn-base reads; the last tile is shifted back so
	// that it ends exactly at the end of the contig.
	if ( g.isize( ) >= hn ) {
          for ( int j = 0; j < g.isize( ); j+= hn ) {
            if ( j > g.isize( ) - hn ) j = g.isize( ) - hn;
	    if ( pass == 1 ) {
              hugereads_rawsize += (hn+15)/16;
	      nhugereads++;    
            }
	    else {
              r.SetToSubOf( g, j, hn );
	      hugereads.push_back(r);
            }
          }
        }
      }
    }
    // Remove any reads.fastb* files already present in the run directory.
    Run( "/bin/rm -f " + run_dir + "/reads.fastb*", log, logfull );
    
    // Define final read pairs.  Reverse complement partners.
    
    // Visit the candidate pairs in the order given by Shuffle() (presumably a
    // random permutation - confirm) and keep a pair only if neither of its
    // reads is already used by a previously kept pair.
    vec<Bool> remove_pair( pairs.size( ), True ), used( nreads, False );
    vec<int> shuffle;
    Shuffle( pairs.size( ), shuffle );
    for ( int i = 0; i < shuffle.isize( ); i++ ) {
      int pi = shuffle[i];
      int id1 = pairs[pi].id1, id2 = pairs[pi].id2;
      if ( used[id1] || used[id2] ) continue;
      used[id1] = True;
      used[id2] = True;
      reads[id2].ReverseComplement( );
      remove_pair[pi] = False;    
    }
    EraseIf( pairs, remove_pair );
    
    // Write files.
    
    // The n-base reads and the huge reads are written separately and then
    // merged into one reads.fastb (presumably with the huge reads last, which
    // the truncation below relies on - confirm against MergeMastervecFiles).
    reads.Write( run_dir + "/reads.fastb", 0, nreads );
    hugereads.Write( run_dir + "/reads.fastb", 0, nhugereads );
    MergeMastervecFiles( run_dir + "/reads.fastb" );
    WritePairs( run_dir, pairs, nreads );    
  }
  
  // From here on 'reads' is the output parameter again: reload the merged
  // file, keep only the first nreads entries and write the result back.
  reads.ReadAll( run_dir + "/reads.fastb" );
  reads.resize(nreads);
  reads.WriteAll( run_dir + "/reads.fastb" );
}


// Local function: MarkDuplicatedKmers
//
// Mark all k-mers which appear more than once in G or its
// reversecomplement.
//
// Template parameters:
//
//    K - the k-mer size
//
// Input parameters:
//
//    G - the sequences to scan
//
// Output parameters:
//
//    dup - cleared, then given one bitvector per sequence of G with one bit
//       per k-mer start position; a bit is set when the record for that
//       position lies in a run of two or more records with identical k-mer
//       bytes after sorting.
template<int K> void MarkDuplicatedKmers( const vecbasevector& G, vecbitvector& dup )
{ 
  int N = G.size( );
  // Count the k-mers of each sequence (zero for sequences shorter than K).
  longlong totalKmers = 0;
  vec<int> kmersPerRead(N);
  for ( int id = 0; id < N; ++id ) {
    int numKmers = Max( 0, G[id].isize() - K + 1 );
    totalKmers += ( kmersPerRead[id] = numKmers );    
  }
  // Create one all-zero bitvector per sequence.
  dup.clear( );
  dup.Reserve( totalKmers/32 + N, N );
  for ( int id = 0; id < N; ++id ) {
    dup.push_back( bitvector( kmersPerRead[id] ) );
    dup.back( ).Zero( );    
  }
  // Identity list of sequence ids, handed to SortKmers.
  vec<int> rid(N);
  for ( int i = 0; i < N; i++ )
    rid[i] = i;
  // Size the record buffer: total k-mer count plus a quarter, divided by 33
  // (presumably an estimate of the records produced by one of the 100
  // passes, with headroom - TODO confirm against SortKmers).
  // NOTE(review): unlike the count above, this sum is not clamped at zero, so
  // a sequence shorter than K-1 bases may wrap the unsigned total - confirm.
  unsigned int S = 0;
  for ( int l = 0; l < G.size( ); l++ )
    S += G[l].size( ) - K + 1;
  S += S/4;
  S /= 33;
  vec< kmer_record<K,2> > R(S);
  cout << "pass " << flush;
  for ( int pass = 0; pass < 100; pass++ ) {
    dummy<100> d100;
    // Presumably fills R with the sorted k-mer records belonging to this pass
    // and sets S to their number - verify against SortKmers.
    SortKmers( d100, G, rid, pass, R, S );
    int i, j;
    for ( i = 0; i < (int) S; i++ ) {
      // Advance j to the first record whose leading (K+3)/4 bytes differ
      // from those of record i; [i,j) is then a run of identical k-mers.
      for ( j = i+1; j < (int) S; j++ ) {
        int l;
        for ( l = (K+3)/4 - 1; l >= 0; l-- )
          if ( R[j].Bytes( )[l] != R[i].Bytes( )[l] ) break;
        if ( l >= 0 ) break;    
      }
      // A run longer than one record means the k-mer is duplicated: mark
      // every occurrence.
      if ( j - i > 1 ) {
        for ( int r = i; r < j; r++ ) {
          int id = R[r].GetId( );
          int pos = R[r].GetPos( );
          // Positions are stored 1-based and may be negative (presumably the
          // sign encodes orientation - confirm); convert to a 0-based index.
          if ( pos < 0 ) pos = -pos;
          --pos;
          dup[id].Set( pos, True );    
        }
      }
      // Continue with the record following the run (the loop adds one).
      i = j - 1;    
    }
    // Progress display: a dot per pass, with the pass count printed every ten
    // passes ("10 ", "20 ", ...); pass 97 supplies the leading "1" of "100 ".
    if ( pass == 97 ) cout << "1";
    else if ( pass % 10 == 8 ) cout << (pass/10 + 1) % 10;
    else if ( pass % 10 == 9 ) cout << "0 ";
    else cout << ".";
    flush(cout);    
  }
  cout << endl;
}  // MarkDuplicatedKmers()

/**
   Local function: SimulateReads_Construction_B

   Intended as a memory-efficient replacement for <Construction A>:
  
   - find the k-mers of G which appear more than once in G (or its reverse
     complement);
   - find the maximal subsequences of G which do not contain duplicate k-mers in
     their interiors, and make them into reads if they have length >= n;
   - every subsequence of length n which contains a duplicate k-mer (and not 
     subsumed by a maximal subsequence) is made into a read;
   - at random create npairs read pairs, consisting of n-base perfect reads from
     N-base segments on G; these are assigned insert parameters (N-2n, dev);
   - build k-mer paths for the reads.
   
   For construction B, we require that a paths file for the genome has been created
   using <GenomeToPaths>.

   Input parameters:

      genome - the genome to generate reads from
      K - the k-mer size used to detect duplicated k-mers
      npairs - number of random read pairs to create with <CreateRandomPairs()>
      hn - length at which a duplicate-free stretch is cut into a read
      n - read length of the paired reads and of the reads covering duplicates
      N, dev - fragment size and deviation handed to <CreateRandomPairs()>

   Output parameters:

      nreads, reads_rawsize - incremented by the number of reads generated and
         by their estimated raw size
      reads - the generated reads; the paired reads are appended first
      quals - passed through to <CreateRandomPairs()>
      pairs - the pairings of the paired reads
      loc_on_ref - the location on the genome of every generated read

   Note:
      This function itself only generates reads, pairs and locations; no
      k-mer paths are built here.
*/
void SimulateReads_Construction_B( const vecbasevector& genome,
				   const int K, const int npairs, 
                                   const int hn, const int n, const int N,
				   const double dev,

				   longlong& nreads, longlong& reads_rawsize, 
				   vecbasevector& reads,  vecqualvector& quals,
				   vec<read_pairing>& pairs,
				   vec<read_location>& loc_on_ref )
{    // Find the duplicated k-mers.
  cout << "finding duplicated k-mers:\n";
  vecbitvector dup;
  // Instantiate MarkDuplicatedKmers for the run-time value of K.
#define DO_MARK(_K) MarkDuplicatedKmers<_K>( genome, dup )
  DISPATCH_ON_K(K, DO_MARK);
  // Report the fraction of k-mer positions that were marked as duplicated.
  longlong totalkmers = 0, dupkmers = 0;
  for ( int i = 0; i < dup.size( ); i++ ) {
    for ( int j = 0; j < (int) dup[i].size( ); j++ ) {
      ++totalkmers;
      if ( dup[i][j] ) ++dupkmers;
    }
  }
  cout << PERCENT_RATIO( 3, dupkmers, totalkmers ) 
       << " of kmers are duplicated" << endl;
  
  // Find the reads.  We could do a little better here because there is no 
  // need to break the reads at points where they have single duplicate 
  // kmers, etc.  Also, even though a kmer is duplicated, it does not 
  // follow that it is a branch point.
  
  // Pass 1 only counts the reads and their raw size.  Pass 2 reserves space,
  // creates the random pairs first and then emits the same reads that pass 1
  // counted; both passes must therefore walk the genome identically.
  pairs.reserve(npairs);
  for ( int pass = 1; pass <= 2; pass++ ) {
    if ( pass == 2 ) {
      reads_rawsize += longlong(2*npairs) * longlong(n+15) / longlong(16);
      nreads += 2 * npairs;
      reads.Reserve( reads_rawsize, nreads );    
      // True reads are not requested from CreateRandomPairs (its default), so
      // this vector is only a placeholder argument.
      vecbasevector true_reads;
      loc_on_ref.reserve(nreads);
      CreateRandomPairs( npairs, pairs, loc_on_ref, genome, reads, quals,
                         true_reads, N, n, dev );    
    }
    
    // Core logic for the next chunk of code:
    //
    // vec<Bool> stops( g.size( ) + 1, False );
    // int last_j = 0; 
    // for ( int j = 0; j <= g.isize( ) - K; j++ )
    // {    int len = j - last_j + K;
    //      if ( j == g.isize( ) - K || d[j] || len == hn )
    //      {    if ( len >= n )
    //           {    stops[ last_j + len ] = True;
    //                -- read from last_j to j+K --    }
    //           if ( d[j] )
    //           {    for ( int u = Max( 0, j - (n-K) ); u < j; u++ )
    //                {    if ( u + n > g.isize( ) ) continue;
    //                     if ( stops[u+n] ) continue;
    //                     stops[u+n] = True;
    //                     -- read from u to u+n --    }
    //           last_j = j;
    //           if ( len == hn && !d[j] ) last_j -= (n-K+1);    }    }
    
    for ( int i = 0; i < genome.size( ); i++ ) {
      const basevector& g = genome[i];
      const bitvector& d = dup[i];
      // stops[e] records that a read ending at position e of this contig has
      // already been emitted, so that no read is generated twice.
      static vec<Bool> stops;
      stops.resize_and_set( g.size( ) + 1, False );
      // Start of the current stretch being extended.
      int last_j = 0;
      for ( int j = 0; j <= g.isize( ) - K; j++ ) {
        // Length of the stretch from last_j through the k-mer starting at j.
        int len = j - last_j + K;
	// Close the stretch at the last k-mer of the contig, at a duplicated
	// k-mer, or when it has grown to hn bases.
	if ( j == g.isize( ) - K || d[j] || len == hn ) {
          static basevector b;
	  if ( len >= n ) {
            stops[ last_j + len ] = True;
	    if ( pass == 1 ) {
              reads_rawsize += (len+15)/16;
	      // cout << i << ".[" << last_j << "," // XXX
	      //      << j+K << "), id = " << nreads // XX
	      //      << "\n"; // XXXXXXXXXXXXXXXXXXXXXXXX
	      ++nreads;    
            }
	    else {
              b.SetToSubOf( g, last_j, len );
	      loc_on_ref.push_back( read_location( reads.size( ), len, i, last_j,
                                                   ForwardOr, g.size( ) ) );
	      reads.push_back(b);    
            }
          }
	  // At a duplicated k-mer, also emit every n-base window that starts
	  // before j, fits on the contig and contains the k-mer at j, unless a
	  // read ending at the same position has already been emitted.
	  if ( d[j] ) {
            // cout << "duplicate k-mer at " << j // XXXXXXXX
            //      << "\n"; // XXXXXXXXXXXXXXXXXXXXXXXXXXXXX
            for ( int u = Max( 0, j - (n-K) ); u < j; u++ ) {
              if ( u + n > g.isize( ) ) continue;
              if ( stops[u+n] ) continue;
              stops[u+n] = True;
              if ( pass == 1 ) {
                reads_rawsize += (n+15)/16;
                // cout << i << ".[" << u << "," // XXX
                //      << u+n << "), id = " // XXXXXXX
                //      << nreads << "\n"; // XXXXXXXXX
                ++nreads;    
              }
              else {
                b.SetToSubOf( g, u, n );
                loc_on_ref.push_back( read_location( reads.size( ), n, i, u,
                                                     ForwardOr, g.size( ) ) );
                reads.push_back(b);
              }
            }
          }
	  // if ( j == g.isize( ) - K ) break;
	  // The next stretch starts at j.  If the stretch was cut only because
	  // it reached hn bases, step back so that the next stretch overlaps
	  // the read just emitted.
	  last_j = j;    
	  if ( len == hn && !d[j] ) 
	    last_j -= (n-K+1);    
        }    
      }    
    }    
  }
}  // SimulateReads_Construction_B()

/**
   Local function: SimulateReads_Construction_C

   Simulate reads according to Construction C.

   CONSTRUCTION=C (intended as a control for B):
  
   - chop G into n-base perfect reads, shifting by one base to get the next read;
   - at random create npairs read pairs, consisting of n-base perfect reads from
     N-base segments on G; these are assigned insert parameters (N-2n, dev);
   - build k-mer paths for the reads.
   
*/
void SimulateReads_Construction_C( const vecbasevector& genome,
				   const int npairs, const int n, const int N,  
				   const double dev,

				   longlong& nreads, longlong& reads_rawsize, 
				   vecbasevector& reads,  vecqualvector& quals,
				   vec<read_pairing>& pairs,
				   vec<read_location>& loc_on_ref )  
{
  pairs.reserve(npairs);
  for ( int pass = 1; pass <= 2; pass++ ) {
    if ( pass == 2 ) {
      reads_rawsize += longlong(2*npairs) * longlong(n+15) / longlong(16);
      nreads += 2 * npairs;
      reads.Reserve( reads_rawsize, nreads );    
      vecbasevector true_reads;
      CreateRandomPairs( npairs, pairs, loc_on_ref, genome, reads, 
                         quals, true_reads, N, n, dev );    
    }
    for ( int i = 0; i < genome.size( ); i++ ) {
      const basevector& g = genome[i];
      static basevector r;
      for ( int j = 0; j <= g.isize( ) - n; j++ ) {
        if ( pass == 1 ) {
          reads_rawsize += (n+15)/16;
	  nreads++;
        }
	else {
          r.SetToSubOf( g, j, n );
	  reads.push_back(r);    
        }
      }
    }
  }
}

/**
   Local function: Parse_Construction_D_Params

   Parse command-line parameters for <Construction D>.
*/
void Parse_Construction_D_Params( const String& PRE,
                                  const String& LIBRARIES,
                                  const int K,
                                  const String& ERROR_GENERATOR_NAME,
                                  const String& MUTATION_PERCENT,
                                  const int PLOIDY,

                                  vec<int>& ns, vec<int>& Ns, vec<int>& unpaired_ns,
                                  vec<double>& devs, vec<double>& Cs, vec<double>& unpaired_Cs,
                                  vec<double>& mutation_rates,
                                  ErrorGenerator& error_generator,
                                  Bool& make_perfect_reads )
{
  // Inputs:
  //   PRE                  - prefix prepended to ERROR_GENERATOR_NAME.
  //   LIBRARIES            - ':'-separated libraries, each a ','-separated list
  //                          of n=, N=, dev= (optionally with '%') and C= fields.
  //   K                    - k-mer size; every paired read length must exceed it.
  //   ERROR_GENERATOR_NAME - error template name, or "" for none.
  //   MUTATION_PERCENT     - mutation rate spec, or "-1.0" for none.
  //   PLOIDY               - when 2, all coverages are halved.
  // Outputs:
  //   ns/Ns/devs/Cs           - one entry per paired library.
  //   unpaired_ns/unpaired_Cs - one entry per unpaired library.
  //   mutation_rates          - rates as fractions (percent / 100).
  //   error_generator         - loaded only if ERROR_GENERATOR_NAME is given.
  //   make_perfect_reads      - true only for a single rate of exactly 0 and
  //                             no error generator.

  // An error generator and explicit mutation rates are mutually exclusive.
  if (ERROR_GENERATOR_NAME != "") {
    error_generator = ErrorGenerator(PRE +"/" + ERROR_GENERATOR_NAME);
    if (MUTATION_PERCENT != "-1.0")
      InputErr("Use MUTATION_PERCENT or ERROR_GENERATOR_NAME, not both.");
  } else if (MUTATION_PERCENT != "-1.0") {
    ParseDoubleSet( MUTATION_PERCENT, mutation_rates );
    // Convert percentages to fractions.
    for ( unsigned int i = 0; i < mutation_rates.size(); ++i )
      mutation_rates[i] /= 100.0;
  }

  make_perfect_reads = ( mutation_rates.size() == 1 &&
                         mutation_rates.front() == 0.0 &&
                         ERROR_GENERATOR_NAME == "");

  // The *_default values carry a field forward from one library to the next:
  // a field given for library i becomes the default for libraries i+1, ...
  // A value of -1 means "never specified".
  int n_default = -1, N_default = -1;
  double C_default = -1, dev_default = -1, dev_percent_default = -1;
  int max_n = 0;
  vec<String> libraries;
  Tokenize( LIBRARIES, libraries, ":" );
  for ( int i = 0; i < libraries.isize( ); i++ ) {
    vec<String> library;
    Tokenize( libraries[i], library, "," );
    int n_val = n_default;
    int N_val = N_default;
    double dev_val = dev_default;
    double dev_percent_val = dev_percent_default;
    double C_val = C_default;
    for ( int j = 0; j < library.isize( ); j++ ) {
      const String& l = library[j];
      if ( l.Contains( "n=" ) )
        n_default = n_val = l.After( "n=" ).Int( );
      else if ( l.Contains( "N=" ) )
        N_default = N_val = l.After( "N=" ).Int( );
      else if ( l.Contains( "dev=" ) && l.Contains( "%" ) ) {
        // A percentage replaces any absolute deviation (and vice versa below),
        // so that only one of the two forms is inherited.
        dev_percent_default = dev_percent_val = l.After( "dev=" ).Before( "%" ).Double( );
        dev_default = dev_val = -1;
      }
      else if ( l.Contains( "dev=" ) ) {
        dev_default = dev_val = l.After( "dev=" ).Double( );
        dev_percent_default = dev_percent_val = -1;
      }
      else if ( l.Contains( "C=" ) )
        C_default = C_val = l.After( "C=" ).Double( );
      else FatalErr( "Don't know what " << l << " is." );
    }

    // A percentage deviation is resolved against this library's own N.
    if ( dev_percent_val >= 0 && N_val >= 0 )
      dev_val = double(N_val) * dev_percent_val / 100.0;

    // Paired library: all four values known.  Unpaired library: only n and C.
    if ( n_val >= 0 && N_val >= 0 && dev_val >= 0 && C_val >= 0 ) {
      ns.push_back(n_val);
      Ns.push_back(N_val);
      devs.push_back(dev_val);
      Cs.push_back(C_val);
    }
    else if ( n_val >= 0 && C_val >= 0 && N_val < 0 && dev_val < 0 ) {
      unpaired_ns.push_back(n_val);
      unpaired_Cs.push_back(C_val);
    }
    else FatalErr( "Can't grok library " << libraries[i] << "." );

    // With per-position rates there must be one rate per read position:
    // SimulateReads_Construction_D reads mutation_rates[j] for every j < n,
    // so continuing with too few rates would index past the end of the vector.
    if ( mutation_rates.size() > 1 && n_val > mutation_rates.isize() ) {
      cout << "Not enough positional mutation rates specified.  Library "
           << "'" << libraries[i] << "'" << endl
           << "has reads of length " << n_val << "." << endl;
      exit(1);
    }
    max_n = Max(max_n, n_val);
  }

  // Paired libraries: both reads must fit in the fragment, and reads must be
  // longer than K.
  for ( int i = 0; i < ns.isize( ); i++ ) {
    ForceAssertLe( 2*ns[i], Ns[i] );
    ForceAssertLt( K, ns[i] );
  }

  // Check if the error generator can produce reads of the required length
  if (ERROR_GENERATOR_NAME != "" && max_n > error_generator.MaxReadSize()) {
    cout << "Maximum read size allowed by selected Error Generator is "
         << error_generator.MaxReadSize() << "\nAbort.\n";
    exit(1);
  }
  /*
    ForceAssert( ! mutation_rates.empty() );
    for ( unsigned int i = 0; i < mutation_rates.size(); ++i )
    ForceAssert( mutation_rates[i] >= 0.0 );
  */
  // Adjust coverage values if ploidy = 2
  if ( PLOIDY == 2 ) {
    for ( int i = 0; i < Cs.isize( ); i++ )
      Cs[i] /= 2.0;
    for ( int i = 0; i < unpaired_Cs.isize( ); i++ )
      unpaired_Cs[i] /= 2.0;
  }
}  // Parse_Construction_D_Params()



/**
   Local function: SimulateReads_Construction_D

   There are four parameters: n, the read size (all simulated reads will have this
   exact size); N, the <fragment> size; C, the coverage, defined as
   C = ((# of reads) x (read size n) / (genome size G)); and dev, the allowed variation
   in fragment size N (dev=10% meaning 10% of N).  This is for generating paired reads;
   for generating unpaired reads, you specify only the read length n and the coverage
   C; see below for details.  See <CreateRandomPairs()> for implementation.
  
   - the argument LIBRARIES should have the form n=n1,N=N1,dev=dev1,C=C1 or
     n=n1,N=N1,dev=dev1,C=C1:n=n2,N=N2,dev=dev2,C=C2 etc., specifying one, two, 
     or more libraries;
   - we allow e.g. dev=10%, meaning 10% of N;
   - if any of the variables n, N, dev, or C is unspecified, but was specified for a
     previous library, inherit that value, inheriting percentages as percentages 
     rather than in terms of the N it was specified with before;
   - we also allow a library i to be specified by n=ni,C=Ci
     which causes unpaired reads of length ni to be created at random to coverage Ci
     (but BEWARE of the preceding rule);
   - for each paired-end library, at random create read pairs at sequence coverage 
     C, consisting of n-base imperfect reads from (N+/-dev)-base segments on G, with
     substitutions introduced at rate MUTATION_PERCENT or using an error template
     specified with ERROR_GENERATOR_NAME; these are assigned insert parameters (N-2n, dev);
   - use either MUTATION_PERCENT or ERROR_GENERATOR_NAME, not both at the same time.
   - The arg MUTATION_PERCENT also accepts the name of a file (preceded by "@")
     containing per-position error rates.
   - Optional error correction.
   - build k-mer paths for the reads.
   - if MUTATION_PERCENT=0, we require that a paths file for the genome has been
     created using GenomeToPaths.
   
*/
void SimulateReads_Construction_D( const String& run_dir,
				   const vecbasevector& genome,
				   const vec<int>& ns, const vec<int>& Ns, const vec<int>& unpaired_ns,
				   const vec<double>& devs, const vec<double>& Cs, const vec<double>& unpaired_Cs,
				   const int K,
				   const ErrorGenerator& error_generator, const vec<double>& mutation_rates,
				   const Bool make_perfect_reads,

				   longlong& nreads, longlong& reads_rawsize, longlong& quals_rawsize,
				   double& avg_mutation_rate,
				   vecbasevector& reads,
				   vecqualvector& quals,
				   vec<read_pairing>& pairs,
				   vec<read_location>& loc_on_ref
				   ) 
{
  longlong genome_bases = 0;
  for ( int i = 0; i < genome.size( ); i++ )
    genome_bases += genome[i].size( );
  vecbasevector true_reads;
  vec<int> error_count;
  int total_npairs = 0;
  double kmerCov = 0.0;
  longlong num_bases = 0;
  for ( int i = 0; i < ns.isize( ); i++ ) {
    int n = ns[i];
    double C = Cs[i];
    kmerCov += C * (double)( n - K + 1 ) / (double)( n );
    int npairs = int( round( float(genome_bases) * C / float( 2 * n ) ) );
    total_npairs += npairs;
    reads_rawsize += longlong(2*npairs) * longlong(n+15) / longlong(16);
    quals_rawsize += longlong(2*npairs) * n;
    nreads += 2 * npairs;    
    double one_read_mutation_rate = 0.0;
    if ( mutation_rates.size() == 1 )
      one_read_mutation_rate += mutation_rates.front() * (double) n;
    else if (mutation_rates.size() > 1) {
      for ( int j = 0; j < n; ++j )
	one_read_mutation_rate += mutation_rates[j];
    }
    avg_mutation_rate += 2 * npairs * one_read_mutation_rate;
    num_bases += 2 * npairs * n;
  }
  for ( int i = 0; i < unpaired_ns.isize( ); i++ ) {
    int n = unpaired_ns[i];
    double C = unpaired_Cs[i];
    kmerCov += C * (double)( n - K + 1 ) / (double)( n );
    int nr = int( round( float(genome_bases) * C / float(n) ) );
    reads_rawsize += longlong(nr) * longlong(n+15) / longlong(16);
    quals_rawsize += longlong(nr) * n;
    nreads += nr;    
    double one_read_mutation_rate = 0.0;
    if ( mutation_rates.size() == 1 )
      one_read_mutation_rate += mutation_rates.front() * (double) n;
    else if (mutation_rates.size() > 1) {
      for ( int j = 0; j < n; ++j )
	one_read_mutation_rate += mutation_rates[j];
    }
    avg_mutation_rate += nr * one_read_mutation_rate;
    num_bases += nr * n;
  }
  avg_mutation_rate /= (double) num_bases;

  methodType method;
  if (error_generator.QualSize() != 0)
    method = ERROR_GENERATOR;
  else if (mutation_rates.size() != 0)
    method = MUTATION_RATES;
  else
    method = NONE;

  reads.Reserve( reads_rawsize, nreads );
  quals.Reserve(quals_rawsize, nreads);    
  true_reads.Reserve( reads_rawsize, nreads );
  true_reads.reserve(nreads);    
  pairs.reserve(total_npairs);
  for ( int i = 0; i < ns.isize( ); i++ ) {
    int n = ns[i], N = Ns[i];
    double dev = devs[i], C = Cs[i];
    int npairs = int( round( float(genome_bases) * C / float( 2 * n ) ) );
    CreateRandomPairs( npairs, pairs, loc_on_ref, genome, reads, quals,
		       true_reads, N, n, dev, True, method, &mutation_rates,
		       &error_generator, True, &error_count);
  }
  for ( int i = 0; i < unpaired_ns.isize( ); i++ ) {
    int n = unpaired_ns[i];
    double C = unpaired_Cs[i];
    int nr = int( round( float(genome_bases) * C / float(n) ) );
    CreateRandomReads( nr, loc_on_ref, genome, reads, quals, true_reads, n, method,
		       &mutation_rates,  &error_generator, True, &error_count);    
  }
  if (!error_count.empty())
    BinaryWrite3(run_dir + "/reads.error_count", error_count);
  if ( ! make_perfect_reads )
    true_reads.WriteAll( run_dir + "/reads.true.fastb" );
  else Symlink( "reads.fastb", run_dir + "/reads.true.fastb" );
  
}  // SimulateReads_Construction_D()


/**
   Local function: WriteSimulatedReadFiles

   Do the final writing out of the simulated reads and related files;
   this is done the same way regardless of the construction by which the reads
   were simulated.

   Writes data files:

       reads.fastb - the simulated reads
       reads.qualb - the simulated quality scores
       reads.pairto, reads.pairtob, reads.pairto_index - for paired simulated reads, the
         read pairings (pairs of reads in reads.fastb between which the approximate
	 distance is known).
       reads.ref.locs - locations of the reads on the reference
       reads.lengths - the lengths of the reads.
       
*/
void WriteSimulatedReadFiles( const String& run_dir,
                              const vecbasevector& genome,
                              const String& CONSTRUCTION,
                              const Bool make_perfect_reads,
                              const double avg_mutation_rate,
                              const int nreads,
                              const vecbasevector& reads,
                              const vecqualvector& quals,
                              const vec<read_pairing>& pairs,
                              vec<read_location>& loc_on_ref )
{
  // Inputs:
  //   run_dir            - directory receiving all output files.
  //   genome             - only its contig count is used (for WriteLocs).
  //   CONSTRUCTION       - construction name, recorded in reads.props.
  //   make_perfect_reads - consulted only when CONSTRUCTION is "D".
  //   avg_mutation_rate  - recorded in reads.props.
  //   nreads             - read count passed to WriteLocs.
  //   reads, quals, pairs- data to write.
  //   loc_on_ref         - read locations; SORTED IN PLACE by this function.

  // Write simulated reads and pairing data
  longlong numreads = reads.size( );
  reads.WriteAll( run_dir + "/reads.fastb" );
  WritePairs( run_dir, pairs, numreads );

  // Write Qualb file if available
  String qualb_file = "reads.qualb";
  if (!quals.empty())
    quals.WriteAll(run_dir + "/" +qualb_file);

  // Write read locations on reference.  The locations are sorted once, in
  // place, and written directly; no second sorted copy is made.
  Sort(loc_on_ref);
  WriteLocs( run_dir + "/reads.ref.locs", loc_on_ref, genome.size( ),
             nreads );


  flush(cout);

  // Write read length information
  vec<int> lengths(numreads);
  for ( int i = 0; i < numreads; i++ )
    lengths[i] = reads[i].size( );
  BinaryWrite2( run_dir + "/reads.lengths", lengths );

  // Write out some properties of the reads, specifying that these are simulated reads,
  // and whether we have simulated errors in the reads.  Constructions A, B and C
  // are always recorded as perfect; D is perfect only if make_perfect_reads is set.

  Properties readProperties;
  readProperties["CONSTRUCTION"] = CONSTRUCTION;
  readProperties.SetBool( "PERFECT_READS", CONSTRUCTION == "A" || CONSTRUCTION == "B" || CONSTRUCTION == "C" ||
                          ( CONSTRUCTION == "D"  &&  make_perfect_reads ) );
  readProperties.SetDouble( "AVG_MUTATION_RATE", avg_mutation_rate );
  readProperties.Store( run_dir + "/reads.props" );

}  // WriteSimulatedReadFiles()



int main( int argc, char *argv[] )
{
  // Program flow: parse and validate the command line, load the genome,
  // run the selected construction (A, B, C or D) to fill the read, quality,
  // pairing and location containers, then write the output files.

  RunTime( );
  
  BeginCommandArguments;
  CommandArgument_String_Doc(CONSTRUCTION, "method for generating simulated reads");
  CommandArgument_String(PRE);
  CommandArgument_String_Doc(DATA, "data dir");
  CommandArgument_String_Doc(RUN, "run dir");
  CommandArgument_Int_OrDefault_Doc(n, -1, "read length");
  CommandArgument_Int_OrDefault_Doc(N, -1, "fragment length");
  CommandArgument_Double_OrDefault_Doc(dev, -1.0,
                                       "stddev for the variation in fragment length");
  CommandArgument_Int_Doc(K, "kmer size for constructions B and D");
  CommandArgument_Int_OrDefault_Doc(npairs, 0,
                                    "number of read pairs to create for constructions B and C");
  CommandArgument_Double_OrDefault(C, -1.0);
  // The arg MUTATION_PERCENT also accepts the name of a file
  // (preceded by "@") containing per-position error rates.
  CommandArgument_String_OrDefault(MUTATION_PERCENT, "-1.0");
  CommandArgument_String_OrDefault_Doc(LIBRARIES, "", "for construction D, "
                                       "n=n1,N=N1,dev=dev1,C=C1 or "
                                       "n=n1,N=N1,dev=dev1,C=C1:n=n2,N=N2,dev=dev2,C=C2 etc., "
                                       "specifying one, two, or more libraries" );
  CommandArgument_UnsignedInt_OrDefault_Doc(SEED, 0,
                                            "random number seed (for choosing where the reads land);"
                                            " only applies to some of the random numbers generated.");
  CommandArgument_String_OrDefault(ERROR_GENERATOR_NAME, "");
  CommandArgument_Bool_OrDefault_Doc(FORCE, False,
                                     "if run dir already exists, destroy and overwrite it");

  EndCommandArguments;

  const int hn = 20000; // huge read size
  
  // Define directories.
  
  ForceAssert( DATA != "" );
  ForceAssert( RUN != "" );
  const String data_dir = PRE + "/" + DATA;
  const String run_dir = PRE + "/" + DATA + "/" + RUN;
  
  if ( !IsDirectory(data_dir) ) {
    cout << "The DATA directory does not exist.\n";
    cout << "DATA directory = " << data_dir << "\n";
    exit(1);    
  }
  // An existing run dir is an error unless FORCE is given, in which case it
  // is removed here and recreated further below.
  if ( IsDirectory(run_dir) ) {
    if (FORCE) SystemSucceed( "/bin/rm -rf " + run_dir );
    else {
      cout << "The RUN directory should not exist.\n";
      cout << "RUN directory = " << run_dir << "\n";
      exit(1);
    }
  }

  // Validate those parameters which we can validate without doing anything complicated 
  // first:
  
  // General
  
  
  ForceAssert( CONSTRUCTION == "A" || CONSTRUCTION == "B" 
               || CONSTRUCTION == "C" || CONSTRUCTION == "D" );
  
  ForceAssertSupportedK( K );
  
  // Construction A, B, C: the library / error-model arguments belong to D only.
  if (CONSTRUCTION != "D") {
    ForceAssert( LIBRARIES == "" );
    ForceAssert( ERROR_GENERATOR_NAME == "");
    ForceAssert( MUTATION_PERCENT == "-1.0");   
  }
  //         ForceAssert( make_perfect_reads );
  // Construction D
  if (CONSTRUCTION == "D") {
    ForceAssert( LIBRARIES != "" );
    if (ERROR_GENERATOR_NAME != "" && MUTATION_PERCENT != "-1.0") 
      InputErr("Use MUTATION_PERCENT or ERROR_GENERATOR_NAME, not both.");
  }
     
  // Construction A, B, C: n, N and dev are mandatory and must be consistent.
  if (CONSTRUCTION != "D" ) {
    ForceAssert( n != -1 );
    ForceAssert( N != -1 );
    ForceAssert( dev != -1.0 );
    ForceAssertLe( 2*n, N );
    ForceAssertLt( n, hn );
    ForceAssertLt( K, n );    
  }
  // Construction D: n, N, dev and C must come from LIBRARIES, not from the
  // stand-alone arguments.
  if (CONSTRUCTION == "D") {
    ForceAssertEq( n, -1 );
    ForceAssertEq( N, -1 );
    ForceAssert( dev == -1.0 );
    ForceAssert( C == -1.0 );
  }
     
  // Construction B, C
  if ( CONSTRUCTION == "B" || CONSTRUCTION == "C" ) {
    ForceAssertGt( npairs, 0 );    
  }
  // Construction A, D
  if ( CONSTRUCTION != "B" && CONSTRUCTION != "C" ) {
    ForceAssert( npairs == 0 );    
  }

  // Determine ploidy.
  if ( !IsRegularFile( data_dir + "/ploidy" ) ) {
    cout << "There must be a ploidy file.\n";
    cout << "Abort.\n";
    exit(1);    
  }
  int PLOIDY = FirstLineOfFile( data_dir + "/ploidy" ).Int( );
  ForceAssert( PLOIDY == 1 || PLOIDY == 2 );

  // Set random seed.
  if ( SEED != 0 ) srandomx(SEED);
  
  // Local vars: Params for Construction D
  // Error generator or mutation rates
  ErrorGenerator error_generator;
  vec<double> mutation_rates;
  double avg_mutation_rate = 0.0;
  // Only Parse_Construction_D_Params assigns this, yet it is passed to
  // WriteSimulatedReadFiles for every construction, so give it a defined value.
  Bool make_perfect_reads = False;
  
  // Library construction information ( Construction D ) 
  vec<int> ns, Ns, unpaired_ns;
  vec<double> devs, Cs, unpaired_Cs;
  
  // Parse library construction D information from the command line
  if ( CONSTRUCTION == "D" )
    Parse_Construction_D_Params( PRE, LIBRARIES, K, ERROR_GENERATOR_NAME, MUTATION_PERCENT, PLOIDY,
                                 ns, Ns, unpaired_ns,
                                 devs, Cs, unpaired_Cs,
                                 mutation_rates, error_generator,
                                 make_perfect_reads );

  // Load genome from which we'll generate the simulated reads.  Get genome size.  
  vecbasevector genome;
  longlong genome_size = 0;
  
  // Prepare run directory
  Mkdir777(run_dir);
  cout << "Created run dir " << run_dir << endl;
  
  // Calculate genome size directly, and record it in <data dir>/genome.size.
  genome.ReadAll(data_dir + "/genome.fastb");
  for ( int i = 0; i < genome.size( ); i++ )
    genome_size += genome[i].size( ); 
  {
    String genomeSizeFileName = data_dir + "/genome.size";
    ofstream genomeSizeFile( genomeSizeFileName.c_str() );
    ForceAssert( genomeSizeFile.good() );
    genomeSizeFile << genome_size;
  }
       

  // Local vars: Simulation output vars
  // Read and pairing variables, where the chosen construction will put its result.
  longlong nreads = 0, reads_rawsize = 0, quals_rawsize = 0;
  vecbasevector reads;
  vecqualvector quals;
  vec<read_pairing> pairs;
  vec<read_location> loc_on_ref;
  
  
  // Start Logging
  Ofstream( log, run_dir + "/SimulateReads.log" );
  String logfull = run_dir + "/SimulateReads.log.full";
  Remove(logfull);
  cout << "logging to\n" << run_dir + "/SimulateReads.log"
       << "\nand " << run_dir + "/SimulateReads.log.full\n" << endl;
  command.PrintTheCommandPretty( log, CSTRING_FROM_DEFINE( MAKE_DATE ) );
  
  
  // Dispatch to the selected construction; exactly one of these runs.
  if ( CONSTRUCTION == "A" )
    SimulateReads_Construction_A( run_dir, log, logfull, genome,
                                  npairs, hn, n, N, dev,
  
                                  nreads, reads_rawsize,
                                  reads, quals, pairs, loc_on_ref );
  
  if ( CONSTRUCTION == "B" )
    SimulateReads_Construction_B( genome, K, npairs, hn, n, N, dev,
                                  
                                  nreads, reads_rawsize,
                                  reads, quals, pairs, loc_on_ref );
  
  if ( CONSTRUCTION == "C" )
    SimulateReads_Construction_C( genome, npairs, n, N, dev,
                                  
                                  nreads, reads_rawsize, 
                                  reads, quals, pairs, loc_on_ref );
  
  if ( CONSTRUCTION == "D" )
    SimulateReads_Construction_D( run_dir, genome, ns, Ns, unpaired_ns, devs, Cs, unpaired_Cs,
                                  K, error_generator, mutation_rates, make_perfect_reads,
                                  nreads, reads_rawsize, quals_rawsize, avg_mutation_rate,
                                  reads, quals, pairs, loc_on_ref );
  
  // Output is written the same way for every construction.
  WriteSimulatedReadFiles( run_dir, genome, CONSTRUCTION, make_perfect_reads, 
                           avg_mutation_rate, nreads,
                           reads, quals, pairs, loc_on_ref );
     
}
