/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2005) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

/*
   Program: ReadsToPaths
  
   Convert reads to paths in k-mer space.  What this means is that we
   assign a unique integer (<kmer number>) to each k-mer which appears in the reads or their
   reverse complements, and we then represent each read (and reverse complement of
   each read) as the corresponding sequence of integers (called a <read path>), 
   according to its constituent sequence of overlapping k-mers (starting at the 
   first base).  Moreover, this is done in such a way that the typical path is the 
   concatenation of a small number of contiguous paths,
   e.g. 500-1000, 12-53, 1200-1323.  This facilitates rapid computation.
  
   *Method*:  Order the reads so that reads with more high quality bases come first.  
   Then for each k-mer occurring in the reads, pick the occurrence o having the 
   lowest-numbered read (and lowest-numbered position if the k-mer appears twice
   on the lowest-numbered read), and assign to the k-mer the integer representing 
   the position of o in the concatenation of all the reads.  The code is adapted 
   from MakeAligns.
  
   Notes:
  
   1. Paths can be efficiently reversed (<KmerPath::Reverse()>), so one might save
   by not storing the paths for the reverse complements of reads.
  
   2. There are undoubtedly better algorithms for generating the read paths --
   better in the sense that the average number of segments per read path is lower.
   For example, in principle one could start with contigs, and use them to number 
   the k-mers.
  
   Stats for dog
   K=96, using error-corrected reads: 
         4.42 segments per read path, 17.5 hours, 2.74 GB final output file size
  
   INPUT FILES:
  
      - reads.fastb
      - reads.qualb
      - genome.size
  
   INTERMEDIATE FILES:
      - mutmers.gz
  
   OUTPUT FILES:
   
      - reads.paths(_rc).k*: these are vecKmerPath objects, which contain the full read
                             paths for each read (and each reverse complement of a read).
      - reads.pathsdb.k*: this is the vec<tagged_rpint> object, which is the KmerPath
                         database associated to reads.paths(_rc).
      - reads.paths.mult.k*
      - ReadsToPaths.log
  
   Note that use of reads.paths(_rc) and reads.pathsdb is deprecated -- they are
   supplanted by the corresponding high quality objects (having interspersed gaps).
   (We do use reads.pathsdb in PathsHQ.cc, but it could be created there for use on
   the fly.)
  
   If HEAD is specified, files are handled differently.
  
   Program parameters:
     PRE - the <WGA data dir>
     DATA - the <project dir>
     RUN - the <run dir>
     HEAD - an alternative way to specify the reads file directly rather than by parts
      ( PRE + DATA + RUN ).
     K - the size of kmers from which we construct the paths.
     VERBOSE - how much debug output to print
     BREAK_LONG - break very long reads to save memory
     GENOME_SIZE - if specified, overrides the <genome.size> file in the <project dir>
     USE_QUALITY_SCORES - Place reads in descending order by number of quality 40+ bases
     PATHS_ONLY - if true, write the read paths for the reads themselves, but
        don't build or write paths for the reverse complements of the reads.
*/

#include "MainTools.h"
#include "paths/ReadsToPathsCore.h"

// Entry point for ReadsToPaths.  This driver contains no path-building logic
// of its own: it declares the command-line arguments and forwards every one
// of them, in declaration order, to ReadsToPathsCore.
int main( int argc, char *argv[] )
{
  // Project-provided start-up call (comes in via MainTools.h).
  RunTime( );

  // ---- Command-line arguments -------------------------------------------
  // Every argument except PRE is declared with an "_OrDefault" macro and the
  // default value shown.
  // NOTE(review): PRE has no default, so it presumably must be supplied even
  // when HEAD is used to name the reads directly -- confirm this is intended.
  BeginCommandArguments;

  // Where the reads live: either PRE/DATA/RUN, or HEAD.
  CommandArgument_String(PRE);
  CommandArgument_String_OrDefault(DATA, "");
  CommandArgument_String_OrDefault(RUN, "");
  CommandArgument_String_OrDefault(HEAD, "");

  // K-mer size used to build the paths.
  CommandArgument_UnsignedInt_OrDefault(K, 48);

  // Behavioural switches.
  CommandArgument_Bool_OrDefault(VERBOSE, False);
  CommandArgument_Bool_OrDefault(BREAK_LONG, True);

  // Empty string means no override was given on the command line.
  CommandArgument_String_OrDefault(GENOME_SIZE, "");

  CommandArgument_Bool_OrDefault(USE_QUALITY_SCORES, True);
  CommandArgument_Bool_OrDefault(PATHS_ONLY, False);

  EndCommandArguments;

  // ---- Do the work ------------------------------------------------------
  // cout is handed over as the final argument (presumably the stream that
  // receives log output -- confirm against paths/ReadsToPathsCore.h).
  ReadsToPathsCore(
      PRE, DATA, RUN, HEAD,
      K,
      VERBOSE, BREAK_LONG,
      GENOME_SIZE,
      USE_QUALITY_SCORES, PATHS_ONLY,
      cout );

  EXIT_MAIN_NORMALLY;
}
