/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2005) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

#ifndef FORCE_DEBUG
     #define NDEBUG
#endif

#include "Basevector.h"
#include "CoreTools.h"
#include "Feudal.h"
#include "math/HoInterval.h"
#include "Set.h"
#include "graph/Digraph.h"
#include "graph/DigraphTemplate.h"
#include "paths/UnipathNhood.h"
#include "paths/UnipathSeeds.h"
#include "paths/simulation/Placement.h"

// Explicit instantiation of the digraphE<fsepdev> constructor that takes an
// existing graph and a list of vertices.  UnipathSeeds (below) uses it to build
// the FGleft / FGright graphs from FG and a set of neighbor vertices.
template digraphE<fsepdev>::digraphE(digraphE<fsepdev> const&, vec<vrtx_t> const&);

// Helper for debugging, implemented at the bottom of this file.  Prints to cout
// every vertex that is adjacent (via From or To) to both v and w in FG, along
// with the copyno[] entry of each of the three vertices involved.
void PrintTwoStepConnections( const digraphE<fsepdev>& FG, int v, int w, 
			      const vec<int>& copyno );

// Status recorded for every unipath by UnipathSeeds (debugging aid).  The
// enumerator values double as indices into status_names[] below, so the two
// lists must be kept in the same order.
enum SeedStatus
{
     SEED_GOOD,        // default status; not rejected by any test
     SEED_ISOLATED,    // has no edges in or out in the graph G
     SEED_SHORT,       // fewer than MIN_KMERS_IN_SEED kmers
     SEED_EXCLUDED,    // flagged by the multi-copy screening pass
     SEED_RC,          // truth is known and has no forward placement
     SEED_REMOVE       // dropped because removing it leaves an acceptable gap
};

// Printable label for each SeedStatus value, indexed by the enumerator.
string status_names[] =
{
     "=== GOOD ===",
     "isolated",
     "short",
     "excluded",
     "rc",
     "gap-ok-to-remove"
};

// UnipathSeeds: choose which unipaths are to be used as seeds.
//
// Outline:
//
// 1. Screening.  A normal unipath v with predicted copy number > 1 and at 
//    least MIN_KMERS_IN_SEED kmers is marked "excluded" if, on its left side
//    (FG.To(v)) and on its right side (FG.From(v)), the eligible neighbors
//    (normal, predicted copy number < 2, at least MIN_KMERS_IN_SEED kmers,
//    separation at most MAX_SEED_DIST) yield a graph whose 
//    ConnectedComponents( ) count is at least predicted_copyno[v].
//
// 2. Eligibility.  A unipath is "good" unless it has no edges at all in G, is
//    shorter than MIN_KMERS_IN_SEED, was excluded in step 1, or (only when
//    USE_TRUTH is set and it has true placements) has no forward placement.
//
// 3. Thinning.  All good unipaths start out chosen.  They are visited in the
//    order produced by WhatPermutation on ( -predicted_copyno, ulen ); a
//    unipath is dropped if
//         left_gap + ulen[v] + right_gap < MAX_SEED_DIST,
//    where left_gap / right_gap are the smallest positive separations in FG to
//    a still-chosen neighbor on each side (MAX_SEED_DIST if there is none).
//
// 4. Output.  seeds is cleared and filled, in increasing order, with the ids
//    of the chosen unipaths.
//
// Side effects: always prints the number of seeds to cout; if no seed was
// chosen, prints an error and calls exit(1).  Further diagnostics go to cout
// depending on DUMP_TRUE_LOCS, USE_TRUTH and verbosity:
//      verbosity > 1                 report every seed removed in step 3;
//      verbosity != 0                add gap annotations to the DUMP_TRUE_LOCS
//                                    listing;
//      verbosity > 2 and USE_TRUTH   list unipaths and statuses along the
//                                    reference (verbosity > 3: all unipaths,
//                                    not just good/removed ones).
//
// UNUSED_SEED_STATS is currently ignored (the code using it is commented out).
// genome is used only when DUMP_TRUE_LOCS is set.

void UnipathSeeds( 

     // inputs:

     const int MIN_KMERS_IN_SEED,              // smallest seed to consider
     const vec<int>& ulen,                     // unipath lengths
     const vec<Bool>& normal,                  // is a given unipath normal 
     const vec<int>& predicted_copyno,         // predicted copy number for unipaths
     const digraphE<sepdev>& G,                // graph of all normal unipaths
     const digraphE<fsepdev>& FG,              // graph of all normal unipaths
     const Bool USE_TRUTH,                     // is this simulated data?
     vecvec<placement> locs,                   // unipath locations on reference
     const Bool UNUSED_SEED_STATS,             // show details about unused seeds?
     const Bool DUMP_TRUE_LOCS,                // print true locations of used seeds?
     const int MAX_SEED_DIST,                  // must use seed if farther than this
     const vecbasevector& genome,              // the reference

     // output:

     vec<int>& seeds, 

     // optional:

     int verbosity

          )

{
     // Screen seeds: for each unipath v, eligible to be a seed, that probably
     // appears more than once in the genome, see if all copies appear to be
     // proximate on both sides to unipaths, eligible to be seeds, appearing 
     // only once.

     int nuni = ulen.size( );
     vec<Bool> excluded_seed( nuni, False );
     for ( int v = 0; v < nuni; v++ )
     {    if ( !normal[v] || predicted_copyno[v] <= 1 ) continue;
          if ( ulen[v] < MIN_KMERS_IN_SEED ) continue;

          // Static scratch vectors, emptied at the start of every iteration.

          static vec<vrtx_t> left, right;
          left.clear( ), right.clear( );

          // Eligible neighbors on the left.

          for ( int j = 0; j < FG.To(v).isize( ); j++ )
          {    vrtx_t w = FG.To(v)[j];
               if ( !normal[w] || predicted_copyno[w] >= 2 ) continue;
               if ( ulen[w] < MIN_KMERS_IN_SEED ) continue;
               const fsepdev& s = FG.EdgeObjectByIndexTo( v, j );
               if ( s.Sep( ) > MAX_SEED_DIST ) continue;
               left.push_back(w);    }
          digraphE<fsepdev> FGleft( FG, left );
          int nleft = FGleft.ConnectedComponents( );
          if ( nleft < predicted_copyno[v] ) continue;

          // Eligible neighbors on the right.

          for ( int j = 0; j < FG.From(v).isize( ); j++ )
          {    vrtx_t w = FG.From(v)[j];
               if ( !normal[w] || predicted_copyno[w] >= 2 ) continue;
               if ( ulen[w] < MIN_KMERS_IN_SEED ) continue;
               const fsepdev& s = FG.EdgeObjectByIndexFrom( v, j );
               if ( s.Sep( ) > MAX_SEED_DIST ) continue;
               right.push_back(w);    }
          digraphE<fsepdev> FGright( FG, right );
          int nright = FGright.ConnectedComponents( );
          if ( nright < predicted_copyno[v] ) continue;

          // Both sides have enough components: exclude v.

          excluded_seed[v] = True;    }

     // Find eligible seeds.

     vec<Bool> good( nuni, False );

     // For debugging purposes: record why bad seeds are bad.

     vec<SeedStatus> seed_status( nuni, SEED_GOOD );

     for ( int v = 0; v < nuni; v++ )
     {    if ( G.From(v).empty( ) && G.To(v).empty( ) )
          {    seed_status[v] = SEED_ISOLATED;
               continue;    }
          if ( ulen[v] < MIN_KMERS_IN_SEED )
          {    seed_status[v] = SEED_SHORT;
               continue;    }
          if ( excluded_seed[v] )
          {    seed_status[v] = SEED_EXCLUDED;
               continue;    }

          // When the truth is known and v is placed on the reference, require
          // at least one forward placement.

          if ( USE_TRUTH && !locs[v].empty( ) )
          {    Bool have_fw = False;
               for ( int u = 0; u < locs[v].size( ); u++ )
               {    const placement& p = locs[v][u];
                    if ( !p.Rc( ) )
                    {    have_fw = True;
                         break;    }    }
               if ( !have_fw )
               {    seed_status[v] = SEED_RC;
                    continue;    }    }
          good[v] = True;    }

     // Start by choosing all "good" unipaths as seeds.
     // Then go through them in some clever order,
     // and eliminate anything whose elimination would
     // not create an unacceptably large gap.

     vec<Bool> chosen = good;

     vec<int> seed_elim_order;

     // Elimination order possibilities: 

     // 1. Try to drop small unipaths first
     // WhatPermutation( ulen, seed_elim_order, less< pair<int,int> >(), false /* do not invert */ );

     // 2. Try to drop repetitive unipaths first, refined by length

     vec< pair<int,int> > copyno_and_len( ulen.size() );
     for ( int i = 0; i < nuni; i++ )
          copyno_and_len[i] = make_pair( -predicted_copyno[i], ulen[i] );
     WhatPermutation( copyno_and_len, seed_elim_order, less< pair<int,int> >(), false /* do not invert */ );

     for ( vec<int>::iterator uni_iter = seed_elim_order.begin();
           uni_iter != seed_elim_order.end(); ++uni_iter )
     {    int v = *uni_iter;
          if ( !good[v] ) continue;

          // How close are the nearest chosen seeds on each side?  Only edges
          // with positive separation count; with no such neighbor the gap
          // stays at MAX_SEED_DIST, which keeps v from being removed as long
          // as the other terms of the sum below are not negative.

          int left_gap = MAX_SEED_DIST, right_gap = MAX_SEED_DIST;
          int left_neighbor = -1, right_neighbor = -1;
          for ( int j = 0; j < FG.To(v).isize(); j++ )
          {    const fsepdev& sd = FG.EdgeObjectByIndexTo( v, j );
               if ( chosen[ FG.To(v)[j] ]
                    && sd.Sep() > 0
                    && left_gap > int(sd.Sep()) )
               {    left_gap = int(sd.Sep());
                    left_neighbor = FG.To(v)[j];    }    }
          for ( int j = 0; j < FG.From(v).isize(); j++ )
          {    const fsepdev& sd = FG.EdgeObjectByIndexFrom( v, j );
               if ( chosen[ FG.From(v)[j] ]
                    && sd.Sep() > 0
                    && right_gap > int(sd.Sep()) )
               {    right_gap = int(sd.Sep());
                    right_neighbor = FG.From(v)[j];    }    }
          int gap_if_removed = left_gap + ulen[v] + right_gap;

          if ( gap_if_removed < MAX_SEED_DIST )
          {    chosen[v] = False;
               seed_status[v] = SEED_REMOVE;

               if ( verbosity > 1 )
               {    cout << "Removing seed " << v << ": graph says it's between "
                         << left_neighbor << " (gap " << left_gap
                         << ") and " << right_neighbor << " (gap " << right_gap
                         << "), gap_if_removed = " << gap_if_removed
                         << ", predicted CN = " << predicted_copyno[v];
                    if ( USE_TRUTH )
                    {    cout << "\n  Locs of left neighbor:  ";
                         if ( left_neighbor != -1 )
                              for ( int j = 0; j < locs[left_neighbor].size(); j++ )
                                   cout << locs[left_neighbor][j] << " ";
                         cout << "\n  Locs of removed seed :  ";
                         for ( int j = 0; j < locs[v].size(); j++ )
                              cout << locs[v][j] << " ";
                         cout << "\n  Locs of right neighbor:  ";
                         if ( right_neighbor != -1 )
                              for ( int j = 0; j < locs[right_neighbor].size(); j++ )
                                   cout << locs[right_neighbor][j] << " ";    }
                    cout << endl;    }    }    }

     // Now record which seeds were chosen

     seeds.clear( );
     for ( int v = 0; v < ulen.isize(); v++ )
          if ( chosen[v] ) seeds.push_back(v);

     cout << "now have " << seeds.size( ) << " seeds" << endl;
     if ( seeds.empty( ) )
     {    cout << "There are no seeds.  Something is very wrong.\n";
          cout << "Aborting.\n";
          exit(1);    }

     // Print stats on unused seeds.

     // Can't do this right now -- not keeping dist info
//      if (UNUSED_SEED_STATS)
//      {    cout << "unused seeds:\n";
//           for ( int v = 0; v < nuni; v++ )
//           {    if ( !good[v] ) continue;
//                if ( BinMember( seeds, v ) ) continue;
//                PRINT4( v, ulen[v], dist_to_left[v], 
//                     dist_to_right[v] );    }    }    

     // Dump true locations of seeds.

     if (DUMP_TRUE_LOCS)
     {    cout << "\nTrue locations of seeds:\n";

          // P = every (placement, seed id) pair, sorted.

          vec< pair<placement,int> > P;
          for ( int i = 0; i < seeds.isize( ); i++ )
          {    int v = seeds[i];
               for ( int j = 0; j < locs[v].size( ); j++ )
                    P.push_back( make_pair( locs[v][j], v ) );    }
          Sort(P);

          // cov[g] accumulates, in the order of P, the intervals of the
          // placements seen so far on genome contig g.

          vec< vec<ho_interval> > cov( genome.size( ) );
          for ( int i = 0; i < P.isize( ); i++ )
          {    int v = P[i].second;
               vec<ho_interval>& gcov = cov[ P[i].first.GenomeId( ) ];

               // This placement, followed by the other placements of v.

               cout << v << ": " << P[i].first;
               for ( int j = 0; j < locs[v].size( ); j++ )
               {    if ( locs[v][j] == P[i].first ) continue;
                    cout << " " << locs[v][j];    }

               // Distance from the end of the previously recorded placement 
               // on this contig.

               if ( verbosity && gcov.size() >= 1 )
                    cout << "\tpreceded by <gap " 
                         << P[i].first.pos() - ( gcov.end() - 1 )->Stop()
                         << ">";

               // Distance from the end of the placement recorded two back on
               // this contig; starred if it is below MAX_SEED_DIST.

               if ( verbosity && gcov.size() >= 2 )
               {    int twogap = P[i].first.pos() - ( gcov.end() - 2 )->Stop();
                    cout << "  <2-gap " << twogap
                         << ( twogap < MAX_SEED_DIST  ? "> ***" : ">" );
                    if ( twogap < MAX_SEED_DIST )
                    {    
                         // P[i-2] exists only for i >= 2.  Logical && is used
                         // so that P[i-2] is never touched unless the index
                         // test has passed.

                         if ( i >= 2 
                              && P[i-2].first.GenomeId() == P[i].first.GenomeId() )
                         {    int back1 = P[i-1].second, back2 = P[i-2].second;

                              // Edge back2 -> back1, if FG has one.

                              cout << "\n\t" << back2 << "->" << back1;
                              int bpTo = BinPosition( FG.To(back1), back2 );
                              if ( bpTo == -1 )
                              {    cout << " not in graph ( ";
                                   // Is there a two-step connection?
                                   PrintTwoStepConnections( 
                                        FG, back2, back1, predicted_copyno );
                                   cout << ")";    }
                              else
                                   cout << " sep=" 
                                        << FG.EdgeObjectByIndexTo(back1,bpTo).Sep()
                                        << ", dev=" 
                                        << FG.EdgeObjectByIndexTo(back1,bpTo).Dev();

                              // Edge back1 -> v, if FG has one.

                              cout << "\n\t" << back1 << "->" << v;
                              int bpFrom = BinPosition( FG.From(back1), v );
                              if ( bpFrom == -1 )
                              {    cout << " not in graph ( ";
                                   // Is there a two-step connection?
                                   PrintTwoStepConnections( 
                                        FG, back1, v, predicted_copyno );
                                   cout << ")";    }
                              else
                                   cout << " sep=" 
                                        << FG.EdgeObjectByIndexFrom(back1,bpFrom).Sep()
                                        << ", dev=" 
                                        << FG.EdgeObjectByIndexFrom(back1,bpFrom).Dev();

                              // Both edges present: what the graph predicts
                              // for the span from back2 to v.

                              if ( bpTo != -1 && bpFrom != -1 )
                                   cout << ";  sep+len+sep = " <<
                                        FG.EdgeObjectByIndexTo(back1,bpTo).Sep() 
                                        + ulen[back1]
                                        + FG.EdgeObjectByIndexFrom(back1,bpFrom).Sep();
                              }    }    }
               cout << endl;    
               gcov.push_back( 
                    ho_interval( P[i].first.pos( ), P[i].first.Pos( ) ) );    }

          // Report up to 20 of the longest stretches not covered by a seed.

          cout << "\nLargest uncovered stretches:\n";
          vec<ho_interval> un;
          vec<int> un_g;
          vec<int> un_length;
          for ( int g = 0; g < genome.size( ); g++ )
          {    static vec<ho_interval> ung;
               Uncovered( genome[g].size( ), cov[g], ung );
               un.append(ung);
               for ( int j = 0; j < ung.isize( ); j++ )
               {    un_g.push_back(g);
                    un_length.push_back( ung[j].Length( ) );    }    }
          ReverseSortSync( un_length, un_g, un );
          vec< vec<String> > rows;
          for ( int j = 0; j < Min( un.isize( ), 20 ); j++ )
          {    vec<String> row;
               row.push_back( ToString( un_length[j] ) );
               row.push_back( ToString( un_g[j] ) + "." + ToString( un[j].Start( ) )
                    + "-" + ToString( un[j].Stop( ) ) );
               rows.push_back(row);    }
          PrintTabular( cout, rows, 2, "rl" );
          flush(cout);    }    

     if ( USE_TRUTH && verbosity > 2 )
     {    
          // Print seeds and statuses in order along the genome.  Unless
          // verbosity > 3, only good and removed unipaths are listed.

          cout << "\nWalking along the reference:" << endl;
          vec< pair<placement,int> > P;
          for ( int v = 0; v < nuni; v++ )
               for ( int j = 0; j < locs[v].size( ); j++ )
                    if ( verbosity > 3
                         || seed_status[v] == SEED_GOOD 
                         || seed_status[v] == SEED_REMOVE )
                         P.push_back( make_pair( locs[v][j], v ) );
          Sort(P);
          for ( int p = 0; p < P.isize(); p++ )
               cout << P[p].first << '\t'
                    << P[p].second << '\t'
                    << locs[ P[p].second ].size() << " locs\t"
                    << status_names[ seed_status[ P[p].second ] ] << '\n';
          cout << endl;    }
}


// helper for debugging:
void PrintTwoStepConnections( const digraphE<fsepdev>& FG, int v, int w, 
			      const vec<int>& copyno ) {
  for(int pass=0; pass<4; pass++) {
    const vec<vrtx_t>& vnbr = (pass & 01) ? FG.To(v) : FG.From(v);
    const String varrow  = (pass & 01) ? "<-"     : "->";
    const vec<vrtx_t>& wnbr = (pass & 02) ? FG.To(w) : FG.From(w);
    const String warrow  = (pass & 02) ? "->"     : "<-";
    vec<int> midpts;
    Intersection( vnbr, wnbr, midpts );
    for(int i=0; i<midpts.isize(); i++)
      cout << v << "/" << copyno[v] << varrow 
	   << midpts[i] << "/" << copyno[midpts[i]]
	   << warrow << w << "/" << copyno[w] << " ";
  }
}
