/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2006) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

// Program: ProtectGoodReads
//
// Declare certain original reads X to be good, and write a file
// reads.protected.orig that carries this information.  It is intended that
// downstream modules that edit or delete reads will read this file and protect
// reads accordingly.
//
// The definition of good is heuristic.  A read X is good if for every (K+1)-mer in
// it (default K=20), there are at least N other reads (default N=3) that contain
// that (K+1)-mer.
//
// If STATE=off, don't protect any reads: reads.protected.orig is still written,
// but with every read marked as not good.
//
// Part of <Error correction>.

#ifndef FORCE_DEBUG
     #define NDEBUG
#endif

#include <set>
     
#include "Basevector.h"
#include "Feudal.h"
#include "MainTools.h"
#include "ReadLocation.h"
#include "math/Functions.h"
#include "paths/KmerPath.h"
#include "paths/ReadsToPathsCoreX.h"

int main( int argc, char *argv[] )
{
     RunTime( );

     BeginCommandArguments;
     CommandArgument_String(PRE);
     CommandArgument_String(DATA);
     CommandArgument_String(RUN);
     CommandArgument_Bool_OrDefault(USE_TRUTH, False);
     CommandArgument_Int_OrDefault(K, 20);
     CommandArgument_Int_OrDefault(N, 3);
     CommandArgument_String_OrDefault(STATE, "on");
     EndCommandArguments;

     // Define directories.

     String data_dir = PRE + "/" + DATA;
     String run_dir = PRE + "/" + DATA + "/" + RUN;

     // Exit if in off state.

     if ( STATE == "off" )
     {    vecbasevector reads( run_dir + "/reads.fastb.orig" );
          vec<Bool> good( reads.size( ), False );
          BinaryWrite3( run_dir + "/reads.protected.orig", good );
          exit(0);    }

     // Get reads.

     vecbasevector reads( run_dir + "/reads.fastb.orig" );
     vecbasevector true_reads;
     if (USE_TRUTH) true_reads.ReadAll( run_dir + "/reads.true.fastb" );
     vec<read_location> readlocs;
     vec<int> readlocs_index;
     if (USE_TRUTH)
     {    READX( run_dir + "/reads.ref.locs", readlocs );
          readlocs_index.resize( reads.size( ) );
          for ( int i = 0; i < readlocs.isize( ); i++ )
          {    const read_location& rl = readlocs[i];
               readlocs_index[ rl.ReadId( ) ] = i;    }    }

     // Get genome size.

     longlong genome_size = 0;
     if (USE_TRUTH)
     {    vecbasevector genome( data_dir + "/genome.fastb" );
          for ( int i = 0; i < genome.size( ); i++ )
               genome_size += genome[i].size( );    }
     else genome_size = StringOfFile( data_dir + "/genome.size", 1 ).Int( );

     // Build paths.

     vecKmerPath paths;
     ReadsToPathsCoreY( reads, K, genome_size, paths );
     vecKmerPath paths_rc(paths);
     for ( int i = 0; i < paths_rc.size( ); i++ )
          paths_rc[i].Reverse( );
     vec<tagged_rpint> pathsdb;
     CreateDatabase( paths, paths_rc, pathsdb );

     // Define good reads.

     set<int> ids;
     vec<Bool> good( reads.size( ), True );
     for ( int id1 = 0; id1 < reads.size( ); id1++ )
     {    const KmerPath& p = paths[id1];
          for ( int i = 0; i < p.NSegments( ); i++ )
          {    for ( longlong x = p.Start(i); x <= p.Stop(i); x++ )
               {    static vec<longlong> con;
                    Contains( pathsdb, x, con );
                    ids.clear( );
                    for ( int j = 0; j < con.isize( ); j++ )
                    {    const tagged_rpint& t = pathsdb[ con[j] ];
                         int id2 = t.ReadId( );
                         if ( id2 == id1 ) continue;
                         int qseg = t.PathPos( );
                         const KmerPath& q 
                              = ( t.PathId( ) >= 0 ? paths[id2] : paths_rc[id2] );
                         if ( !ProperOverlapExt( p, q, i, qseg ) ) continue;
                         if ( i == 0 && x == p.Start(i)
                              && x == q.Stop(qseg) && qseg == q.NSegments( ) - 1 )
                         {    continue;    }
                         if ( i == p.NSegments( ) - 1 && x == p.Stop(i)
                              && x == q.Start(qseg) && qseg == 0 )
                         {    continue;    }
                         ids.insert(id2);
                         if ( (int) ids.size( ) == N ) break;    }
                    if ( (int) ids.size( ) < N )
                    {    good[id1] = False;
                         goto next_read;    }    }    }
          if ( USE_TRUTH && reads[id1] != true_reads[id1] )
          {    const read_location& rl = readlocs[ readlocs_index[id1] ];
               cout << "protecting erroneous read " << id1 << ", true start = " 
                    << rl.Contig( ) << "." << rl.Start( ) << "\n";    }
          next_read: continue;    }
     BinaryWrite3( run_dir + "/reads.protected.orig", good );    }
