/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2007) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

// HowUnique.  For a given genome, compute the fraction of perfect N-base reads
// that can be aligned uniquely to it and the fraction of the genome that would
// be covered by them.  Files GENOME.fastb and GENOME.lookup must be provided.
//
// If a read has a false placement having <= D mismatches, the read is counted
// as not aligning uniquely.  Only alignments subsuming a K-base perfect match
// are seen.
//
// If SAMPLE is specified, only use that many reads.  In that case, coverage
// fraction can't be computed.
//
// If SAMPLE > 0, reads that involve ambiguous bases are skipped.  
// If SAMPLE == 0, ambiguous bases are not handled correctly.
//
// If COUNT_PERFECT=True, count mean number of perfect placements, and don't do
// anything else.  For this, SAMPLE must also be specified.  If also
// COUNT_PERFECT_VERBOSE=1, show number of placements of each read.  If 
// COUNT_PERFECT_VERBOSE=2, also show sequence.
//
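// COUNT_PERFECT_FALSE: modifier for COUNT_PERFECT (and implies it): subtract one
// from the reported mean placement count.  The per-read counts printed under
// COUNT_PERFECT_VERBOSE are not adjusted.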
//
// We allow N to be a ParseIntSet-style list.

#ifndef FORCE_DEBUG
     #define NDEBUG
#endif

#include "Basevector.h"
#include "Feudal.h"
#include "FeudalMimic.h"
#include "MainTools.h"
#include "ParseSet.h"
#include "lookup/ImperfectLookup.h"
#include "lookup/LookAlign.h"
#include "lookup/PerfectCount.h"
#include "lookup/PerfectLookup.h"
#include "math/Functions.h"
#include "random/Random.h"

const char* DOC =
"For a given genome, compute the fraction of perfect N-base reads "
"that can be aligned uniquely to it and the fraction of the genome that would "
"be covered by them.  Files GENOME.fastb and GENOME.lookup must be provided.";

int main( int argc, char *argv[] )
{
     RunTime( );

     BeginCommandArguments;
     CommandDoc( DOC );
     CommandArgument_String(GENOME);
     CommandArgument_String_OrDefault_Doc(FASTB, "", 
          "to override default GENOME.fastb" );
     CommandArgument_String_Doc(N, "length of reads");
     CommandArgument_Int_Doc(D, "number of mismatches allowed");
     CommandArgument_Int_OrDefault_Doc(K, 12, "matches must subsume a perfect K-mer");
     CommandArgument_Int_OrDefault_Doc(SAMPLE, 0, "how many samples? if 0, sample ALL N-mers");
     CommandArgument_Bool_OrDefault_Doc(COUNT_PERFECT, False,
                                        "show average # of perfect matches per read");
     CommandArgument_Bool_OrDefault_Doc(COUNT_PERFECT_FALSE, False,
                                        "show average # of perfect but false matches per read");
     CommandArgument_Int_OrDefault(COUNT_PERFECT_VERBOSE, 0);
     CommandArgument_Int_OrDefault_Doc(ERRORS, 0,
          "Introduce ERRORS mismatches into every read.");
     CommandArgument_Int_OrDefault_Doc(PRECISION, 3, "for printing results");
     EndCommandArguments;

     vec<int> Ns;
     ParseIntSet( N, Ns );

     if (COUNT_PERFECT_FALSE) COUNT_PERFECT = True;
     if (COUNT_PERFECT) ForceAssertGt( SAMPLE, 0 );

     vecbasevector genome( FASTB == "" ? GENOME + ".fastb" : FASTB );
     String ambig_file = GENOME + ".fastamb";
     vecbitvector ambig;
     if ( IsRegularFile( ambig_file ) )
       ambig.ReadAll( ambig_file );

     for ( int in = 0; in < Ns.isize( ); in++ )
     {    int N = Ns[in];

          vecbasevector reads;
          if ( SAMPLE == 0 )
          {    for ( int i = 0; i < genome.size( ); i++ )
               {    for ( int j = 0; j <= genome[i].isize( ) - N; j++ )
                    {    static basevector b;
                         b.SetToSubOf( genome[i], j, N );
                         reads.push_back_reserve(b);    }    }    }
          else
          {    vec<int> start;
               start.push_back(0);
               for ( int i = 0; i < genome.size( ); i++ )
                    start.push_back( start.back( ) + genome[i].size( ) );
               for ( int u = 0; u < SAMPLE; u++ )
               {    int r = randomx( ) % start.back( );
                    int g;
                    for ( g = 0; g < genome.size( ); g++ )
                         if ( r < start[g+1] ) break;
                    if ( r + N > start[g+1] )
                    {    --u;
                         continue;    }
                    if ( ! ambig.empty() ) {
                      bool hit_ambig = false;
                      for ( int b = 0; b < N; ++b )
                        if ( ambig[g][r-start[g]+b] ) {
                          hit_ambig = true;
                          break;
                        }
                      if ( hit_ambig ) {
                        --u;
                        continue;
                      }
                    }
                    static basevector b;
                    b.SetToSubOf( genome[g], r - start[g], N );
                    reads.push_back_reserve(b);    }    }
          if ( ERRORS > 0 )
          {    for ( int i = 0; i < reads.size( ); i++ )
               {    static vec<int> pos;
                    pos.clear( );
                    while(1)
                    {    int p = randomx( ) % N;
                         if ( Member( pos, p ) ) continue;
                         pos.push_back(p);
                         if ( pos.isize( ) == ERRORS ) break;    }
                    for ( int k = 0; k < ERRORS; k++ )
                    {    int p = pos[k];
                         int add = ( randomx( ) % 3 ) + 1;
                         reads[i].Set( p, ( reads[i][p] + add ) % 4 );    }    }    }

          vec<look_align> aligns;
          if (COUNT_PERFECT)
          {    vec<int> places;
               PerfectCountPlaces( reads, GENOME + ".lookup", FW_OR_RC, places );
               double meanp = Mean(places);
               if (COUNT_PERFECT_FALSE) --meanp;

               cout << "N = " << N << " --> ";
               RightPrecisionOut( cout, meanp, PRECISION );
               cout << " placements per read\n";
               if ( COUNT_PERFECT_VERBOSE == 1 )
               {    for ( int i = 0; i < SAMPLE; i++ )
                         cout << places[i] << "\n";    }
               if ( COUNT_PERFECT_VERBOSE == 2 )
               {    for ( int i = 0; i < SAMPLE; i++ )
                    {    reads[i].Print( cout, "read_" + ToString(i) + " - occurs " 
                              + ToString(places[i]) + " times" );    }    }
               continue;    }
          vec<int> min_errs;
          ImperfectLookup( K, reads, GENOME + ".lookup", aligns, 
               min_errs, FW_OR_RC, D );
          cout << PERCENT_RATIO( PRECISION, aligns.isize( ), reads.size( ) )
               << " of reads can be aligned uniquely" << endl;
          if ( SAMPLE == 0 )
          {    vecbitvector cov;
               Mimic( genome, cov );
               for ( int i = 0; i < aligns.isize( ); i++ )
               {    const look_align& la = aligns[i];
                    int start = la.pos2( ), stop = la.Pos2( );
                    int tig = la.target_id;
                    for ( int j = start; j < stop; j++ )
                         cov[tig].Set( j, True );    }
               cout << setprecision(PRECISION) << 100.0 * Coverage(cov)
                    << "% of genome covered by uniquely aligning reads" 
                    << endl;    }    }    }
