/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2005) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

// Unipather.  Consider the reads, always including the reverse complements in the
// set.  We assume that the reads have no gaps.  Divide all kmers in the read set 
// into kmer paths ("unipaths"), such that:
// - each kmer is in exactly one unipath;
// - if two kmers are adjacent in a unipath then they are adjacent in some read; 
// - unipaths contain no branch points (meaning that if two kmers xy are adjacent
//   in a unipath, then we never have xy' or x'y in a read for some other kmers 
//   x' or y');
// - otherwise, unipaths are maximal.
//
// This construction appears closely related to the construction of unitigs 
// described in Myers et al. 2000.
//
// Generates: reads.unipaths.k*, reads.unipathsdb.k*, where * is K.
//
// The logic of how the unipather works could probably be improved substantially,
// so as to speed it up.  In particular, two things to look at are 
// (a) redundancy: the code is designed to rebuild a given unipath over and over,
// discarding duplicates on the fly;
// (b) multiple searches of pathsdb, some of which are unnecessary.
//
// If VALIDATE=True, guarantee correctness by checking:
// - the kmers which appear in the unipaths are exactly the kmers which appear in
//   the read set, and no kmer appears more than once in the unipaths;
// - if a read shares a kmer with a unipath, then the partial alignment defined
//   by the shared kmer extends to a proper alignment;
// - every unipath is covered by overlapping reads;
// - if x is an end kmer of a unipath, then either the number of kmers adjacent to
//   x (off the end of the unipath) is not one, or else the unique adjacent kmer y
//   has a kmer adjacent to it (towards the unipath), other than x.
// Needless to say, the validator is itself sufficiently complicated that there 
// could be some uncertainty as to its correctness.
//
// Complication.  At least in artificial data sets, there can be circles without
// branch points.  These are arbitrarily broken.  Examples of this type have not
// been tested with VALIDATE=True.
//
// If SIM=True, generate a vecvec<placement>, giving locations on genome.fastb,
// in the file reads.unipaths.k*.locs.  If in addition SHOW_PLACEMENTS=True,
// show the placements.

#ifndef FORCE_DEBUG
     #define NDEBUG
#endif

#include "Alignment.h"
#include "math/Functions.h"
#include "math/HoInterval.h"
#include "MainTools.h"
#include "pairwise_aligners/PerfectAligner.h"
#include "paths/ImproperMerge.h"
#include "paths/KmerBaseBroker.h"
#include "paths/KmerPath.h"
#include "paths/Unipath.h"
#include "paths/simulation/Placement.h"

// Entry point.  Parses the command-line arguments (PRE, DATA, RUN, K, and the
// optional booleans VALIDATE, SIM, SHOW_PLACEMENTS, all defaulting to False),
// loads the read paths, their reverse complements and the pathsdb for the
// given K from PRE/DATA/RUN, builds the unipaths and the unipathsdb, and writes
// the unipathsdb to reads.unipathsdb.k<K>.  If SIM=True, unipaths are aligned
// to DATA/genome.fastb and their placements are written to
// reads.unipaths.k<K>.locs (and printed if SHOW_PLACEMENTS=True).  If
// VALIDATE=True, the checks described at the top of this file are run; a
// failed end-of-unipath check prints a message and exits with status 1.

int main( int argc, char *argv[] )
{
     RunTime( );

     BeginCommandArguments;
     CommandArgument_String(PRE);
     CommandArgument_String(DATA);
     CommandArgument_String(RUN);
     CommandArgument_Int(K);
     CommandArgument_Bool_OrDefault(VALIDATE, False);
     CommandArgument_Bool_OrDefault(SIM, False);
     CommandArgument_Bool_OrDefault(SHOW_PLACEMENTS, False);
     EndCommandArguments;

     String data_dir = PRE + "/" + DATA;
     String run_dir = PRE + "/" + DATA + "/" + RUN;
     String KS = ToString(K);

     // Read in data.

     cout << Date( ) << ": loading data" << endl;
     vecKmerPath paths( run_dir + "/reads.paths.k" + KS );
     vecKmerPath paths_rc( run_dir + "/reads.paths_rc.k" + KS );    
     BREAD2( run_dir + "/reads.pathsdb.k" + KS, vec<tagged_rpint>, pathsdb );

     // Build unipaths.  The output file name for the unipaths is handed to
     // Unipath; the unipathsdb is written here.

     vecKmerPath unipaths;
     vec<tagged_rpint> unipathsdb;
     String unipaths_file = run_dir + "/reads.unipaths.k" + KS;
     Unipath( paths, paths_rc, pathsdb, unipaths, unipathsdb, True, unipaths_file );
     BinaryWrite3( run_dir + "/reads.unipathsdb.k" + KS, unipathsdb );    
     cout << Date( ) << ": Found " << unipaths.size( ) << " unipaths." << endl;

     // Find locations of unipaths on reference.

     if (SIM)
     {    
          // Put the unipath sequences first in seqs, then read in the genome
          // (NOTE(review): presumably the True argument to ReadAll means 
          // "append" -- the indexing by unipaths.size( ) below relies on the 
          // genome sequences following the unipath sequences; confirm).

          vecbasevector seqs;
          KmerBaseBroker kbb( run_dir, K );
          for ( int i = 0; i < unipaths.size( ); i++ )
               seqs.push_back_reserve( kbb.ToSequence( unipaths[i] ).Seq(0) );
          seqs.ReadAll( data_dir + "/genome.fastb", True );
          vec<alignment_plus> aligns;
          PerfectAligner( K, PerfectAligner::findProperOnly, &cout ).Align( seqs, aligns, unipaths.size( ) );

          // Convert each alignment into a placement, indexed by Id1.  For
          // alignments with Rc2 set, the coordinates on sequence 2 are 
          // flipped relative to its length g.

          vec< vec<placement> > locs0( unipaths.size( ) );
          for ( int i = 0; i < aligns.isize( ); i++ )
          {    const alignment_plus& ap = aligns[i];
               if ( !ap.Rc2( ) )
               {    locs0[ ap.Id1( ) ].push_back( placement( ap.Id2( ), 
                         ap.a.pos2( ), ap.a.Pos2( ), ap.Rc2( ) ) );    }
               else
               {    int g = seqs[ unipaths.size( ) + ap.Id2( ) ].size( );
                    locs0[ ap.Id1( ) ].push_back( placement( ap.Id2( ), 
                         g - ap.a.Pos2( ), g - ap.a.pos2( ), ap.Rc2( ) ) );    }    }
          if (SHOW_PLACEMENTS) cout << "\n";

          // Sort the placements of each unipath, optionally print them, and
          // pack them into a vecvec for writing.

          vecvec<placement> locs;
          locs.Reserve( aligns.size( ), unipaths.size( ) );
          for ( int i = 0; i < locs0.isize( ); i++ )
          {    Sort( locs0[i] );
               static serfvec<placement> v;
               v.clear( );
               for ( int j = 0; j < locs0[i].isize( ); j++ )
               {    if (SHOW_PLACEMENTS)
                    {    const placement& p = locs0[i][j];
                         cout << "unipath " << i << " (l=" 
                              << unipaths[i].KmerCount( ) << ") placed at " 
                              << p.GenomeId( ) << "." << p.pos( ) << "-" 
                              << p.Pos( ) << " (" << ( p.Fw( ) ? "fw" : "rc" ) 
                              << ")\n";    }
                    v.push_back( locs0[i][j] );    }
               locs.push_back(v);    }
          locs.WriteAll( run_dir + "/reads.unipaths.k" + KS + ".locs" );    }

     // Validate unipaths.

     if ( !VALIDATE ) exit(0);

     // Check that the total kmer count of the unipaths agrees with the total
     // length of the entries in unipathsdb.

     longlong n_kmers = 0, total_kmers = 0;
     for ( int i = 0; i < unipaths.size( ); i++ )
          n_kmers += unipaths[i].KmerCount( );
     for ( int i = 0; i < unipathsdb.isize( ); i++ )
          total_kmers += unipathsdb[i].Length( );
     ForceAssertEq( n_kmers, total_kmers );

     // Merge the intervals of pathsdb (pass 0) and of unipathsdb (pass 1) into
     // maximal runs of consecutive kmers, and check that the two sets of runs
     // are identical.  The scan walks the database in order, extending the 
     // current run [from, to] while the next entry starts at or before to + 1.

     vec< pair<longlong, longlong> > paths_cat[2];
     for ( int pass = 0; pass < 2; pass++ )
     {    longlong from = -1, to = -2;
          const vec<tagged_rpint>& db = ( pass == 0 ? pathsdb : unipathsdb );
          for ( int i = 0; i < db.isize( ); i++ )
          {    const tagged_rpint& t = db[i];
               if ( t.Start( ) > to + 1 )
               {    if ( to >= 0 ) paths_cat[pass].push_back( make_pair(from, to) );
                    from = t.Start( ), to = t.Stop( );    }
               else to = Max( to, t.Stop( ) );    }

          // Flush the last run.  Nothing has been pushed yet if the database
          // is empty or collapses to a single run, so back( ) may only be 
          // consulted when the vector is nonempty.

          if ( to >= 0 
               && ( paths_cat[pass].empty( ) 
                    || to > paths_cat[pass].back( ).second ) )
          {    paths_cat[pass].push_back( make_pair( from, to ) );    }    }
     ForceAssert( paths_cat[0] == paths_cat[1] );

     // Per-unipath checks.

     vec<longlong> locs;
     for ( int i = 0; i < unipaths.size( ); i++ )
     {    const KmerPath& p = unipaths[i];

          // Every read sharing a segment with the unipath must merge with it
          // in exactly one way, with the merger reaching the beginning of one
          // of the two paths on the left and the end of one on the right.  
          // The kmer ranges of the unipath covered by these mergers are 
          // collected, and must cover the whole unipath.

          static vec<ho_interval> covered;
          covered.clear( );
          for ( int j = 0; j < p.NSegments( ); j++ )
          {    Contains( pathsdb, p.Segment(j), locs );
               for ( int u = 0; u < locs.isize( ); u++ )
               {    const tagged_rpint& t = pathsdb[ locs[u] ];
                    int id = t.ReadId( );
                    static vec<ImproperMerger> aligns;

                    // A negative PathId selects the reverse complement path.

                    const KmerPath& q =
                         ( t.PathId( ) >= 0 ? paths[id] : paths_rc[id] );
                    ImproperMergePaths( p, q, j, t.PathPos( ), aligns );
                    ForceAssertEq( aligns.size( ), 1 );
                    const ImproperMerger& m = aligns[0];
                    ForceAssert( m.left_end1.atBegin( ) || m.left_end2.atBegin( ) );
                    ForceAssert( m.right_end1.atEnd( ) || m.right_end2.atEnd( ) );
                    int start = DistMin( p.Begin( ), m.left_end1 );
                    int stop = DistMin( p.Begin( ), m.right_end1 ) + 1;

                    // For a merger starting in the interior of the unipath,
                    // skip it if it spans a single kmer, and otherwise drop
                    // its first kmer, so that adjacent reads must genuinely
                    // overlap to yield full coverage.

                    if ( start > 0 )
                    {    if ( start == stop - 1 ) continue;
                         ++start;    }
                    covered.push_back( ho_interval( start, stop ) );    }    }
          ForceAssertEq( TotalCovered(covered), p.KmerCount( ) );

          // Left end check.  Collect the kmers that precede the first kmer of
          // the unipath in some read.  If there is exactly one such kmer, then
          // it must be followed in the reads by more than one distinct kmer;
          // otherwise the unipath could have been extended to the left.

          longlong first = p.Segment(0).Start( );
          static vec<longlong> before_first, after_before_first;
          before_first.clear( ), after_before_first.clear( );
          Contains( pathsdb, first, locs );
          for ( int u = 0; u < locs.isize( ); u++ )
          {    const tagged_rpint& t = pathsdb[ locs[u] ];
               int id = t.ReadId( ), pp = t.PathPos( );
               const KmerPath& q = ( t.PathId( ) >= 0 ? paths[id] : paths_rc[id] );

               // The predecessor is either inside the same segment, or is the
               // last kmer of the preceding segment, if there is one.

               if ( q.Segment(pp).Start( ) < first ) before_first.push_back(first-1);
               else if ( pp > 0 ) 
               {    before_first.push_back( q.Segment(pp-1).Stop( ) );    }    }
          UniqueSort(before_first);
          if ( before_first.size( ) == 1 )
          {    Contains( pathsdb, before_first[0], locs );
               for ( int u = 0; u < locs.isize( ); u++ )
               {    const tagged_rpint& t = pathsdb[ locs[u] ];
                    int id = t.ReadId( ), pp = t.PathPos( );
                    const KmerPath& q 
                         = ( t.PathId( ) >= 0 ? paths[id] : paths_rc[id] );
                    if ( q.Segment(pp).Stop( ) > before_first[0] )
                         after_before_first.push_back( before_first[0] + 1 );
                    else if ( pp < q.NSegments( ) - 1 )
                    {    after_before_first.push_back( 
                              q.Segment(pp+1).Start( ) );    }    }
               UniqueSort(after_before_first);
               if ( after_before_first.size( ) <= 1 )
               {    cout << "unipath " << i << " terminates prematurely on left:\n";
                    cout << unipaths[i] << "\n";
                    exit(1);    }    }

          // Right end check, the mirror image of the left end check: collect
          // the kmers that follow the last kmer of the unipath in some read, 
          // and if there is exactly one, require that it be preceded in the
          // reads by more than one distinct kmer.

          longlong last = p.LastSegment( ).Stop( );
          static vec<longlong> after_last, before_after_last;
          after_last.clear( ), before_after_last.clear( );
          Contains( pathsdb, last, locs );
          for ( int u = 0; u < locs.isize( ); u++ )
          {    const tagged_rpint& t = pathsdb[ locs[u] ];
               int id = t.ReadId( ), pp = t.PathPos( );
               const KmerPath& q = ( t.PathId( ) >= 0 ? paths[id] : paths_rc[id] );
               if ( q.Segment(pp).Stop( ) > last ) after_last.push_back(last+1);
               else if ( pp < q.NSegments( ) - 1 )
               {    after_last.push_back( q.Segment(pp+1).Start( ) );    }    }
          UniqueSort(after_last);
          if ( after_last.size( ) == 1 )
          {    Contains( pathsdb, after_last[0], locs );
               for ( int u = 0; u < locs.isize( ); u++ )
               {    const tagged_rpint& t = pathsdb[ locs[u] ];
                    int id = t.ReadId( ), pp = t.PathPos( );
                    const KmerPath& q 
                         = ( t.PathId( ) >= 0 ? paths[id] : paths_rc[id] );
                    if ( q.Segment(pp).Start( ) < after_last[0] )
                         before_after_last.push_back( after_last[0] - 1 );
                    else if ( pp > 0 )
                    {    before_after_last.push_back( 
                              q.Segment(pp-1).Stop( ) );    }    }
               UniqueSort(before_after_last);
               if ( before_after_last.size( ) <= 1 )
               {    cout << "unipath " << i << " terminates prematurely on right:\n";
                    cout << unipaths[i] << "\n";
                    exit(1);    }    }    }    }
