/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2006) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

/*
   Program: PathsToLocs
   
   Given some kmer paths from a _simulated_ read set, find all their
   possible placements on the genome.  There should be NO gaps in anything.
*/

#ifndef FORCE_DEBUG
     #define NDEBUG
#endif

#include "FeudalTemplate.h"
#include "MainTools.h"
#include "ParseSet.h"
#include "graph/Digraph.h"
#include "graphics/Whiteboard.h"
#include "paths/KmerPath.h"
#include "paths/Unipath.h"
#include "paths/simulation/GenomePlacement.h"


// PathsToLocs entry point.
//
// Reads the kmer paths named by PATHS and the genome kmer paths for K, then
// either loads previously saved placements (LOAD_PLACEMENTS_FROM) or computes
// them: for each path (and, unless FW_ONLY, its reverse) the middle kmer is
// looked up in the genome path database and the match is extended to the left
// and right with ScanLeftPerfectMatch / ScanRightPerfectMatch.  A placement is
// kept when the matched kmer interval is at least MATCH_LEN_PERCENT of the
// path's kmers.
//
// It then prints, in order:
//   - uncovered genome intervals and the fraction of genome bases covered,
//   - the N50 of placed path sizes,
//   - the placements themselves (SHOW_PLACEMENTS, optionally filtered),
//   - in UNIPATH mode, a table comparing predicted copy-number probabilities
//     against the observed number of placements, plus an optional plot,
//   - if PRINT_ADJACENCY_GRAPH, the unipath adjacency graph with predicted
//     copy numbers.
//
// Parameters: argc/argv are consumed by the command-argument macros.
// Returns: control falls off the end of main on success.  Bad arguments or
// missing inputs are reported through exit(1), FatalErr or ForceAssert.
int main( int argc, char *argv[] )
{
     RunTime( );

     BeginCommandArguments;
     CommandArgument_String(PRE);
     CommandArgument_String(DATA);
     CommandArgument_String(RUN);
     CommandArgument_String_OrDefault(PATHS, "");
     CommandArgument_Int(K);
     CommandArgument_String_OrDefault(SAVE_PLACEMENTS_TO, "");
     CommandArgument_String_OrDefault(LOAD_PLACEMENTS_FROM, "");
     CommandArgument_Bool_OrDefault(UNIPATH, False);
     CommandArgument_Bool_OrDefault(FW_ONLY, False);
     CommandArgument_String_OrDefault(COPY_PDF_FILE, "");
     CommandArgument_Int_OrDefault(MATCH_LEN_PERCENT, (UNIPATH ? 75 : 100) );
     CommandArgument_Bool_OrDefault(SHOW_PLACEMENTS, !UNIPATH);
     CommandArgument_Bool_OrDefault(SHOW_FW_ONLY, True);
     CommandArgument_String_OrDefault(SHOW_THESE_CONTIGS_ONLY, "");
     CommandArgument_Int_OrDefault(SHOW_WINDOW_BEGIN, -1);
     CommandArgument_Int_OrDefault(SHOW_WINDOW_END, -1);
     CommandArgument_Bool_OrDefault(KMER_COORDS, False);
     CommandArgument_Int_OrDefault(MIN_KMERS, 1);
     CommandArgument_Bool_OrDefault(UNIPATH_PLOT, False);
     CommandArgument_Bool_OrDefault(PRINT_ADJACENCY_GRAPH, True);
     EndCommandArguments;

     // ---- Argument validation. ----

     if ( MATCH_LEN_PERCENT < 50 ) {
          FatalErr("MATCH_LEN_PERCENT must be at least 50%");
     }

     if ( SAVE_PLACEMENTS_TO.nonempty( ) && LOAD_PLACEMENTS_FROM.nonempty( ) ) {
          FatalErr("Specify at most one of SAVE_PLACEMENTS_TO or LOAD_PLACEMENTS_FROM filenames");
     }

     String datadir = PRE + "/" + DATA;
     String run_dir = PRE + "/" + DATA + "/" + RUN;

     if ( PATHS == "" && !UNIPATH ) {
          cout << "You must specify PATHS unless in UNIPATH mode" << endl;
          exit(1);
     }

     // In UNIPATH mode the paths default to the run's unipaths file.
     if ( UNIPATH && PATHS == "" )
          PATHS = run_dir + "/reads.unipaths.k" + ToString(K);

     vecKmerPath paths(PATHS);

     // ---- Load the genome paths, preferring the run dir over the data dir. ----

     vecKmerPath genome_paths;
     String genome_file = "/genome.paths.k" + ToString(K);
     if ( IsRegularFile( run_dir + genome_file ) ) {
          genome_paths.ReadAll( run_dir + genome_file );
     }
     else if ( IsRegularFile( datadir + genome_file ) ) {
          cout << "Reading global genome.paths; I hope the kmer numbering is the same!"
               << endl;
          genome_paths.ReadAll( datadir + genome_file );
     }
     else {
          FatalErr("No genome.paths file found!");
     }

     // Count genome paths that differ from their canonicalized copy and warn,
     // since the perfect-match scans below may then miss alignments.
     int numNoncanonicalPaths = 0;
     for ( int i = 0; i < genome_paths.size( ); ++i ) {
          KmerPath canonicalPath = genome_paths[i];
          canonicalPath.Canonicalize( );
          if ( ! ( canonicalPath == genome_paths[i] ) )
               ++numNoncanonicalPaths;
     }
     if ( numNoncanonicalPaths > 0 ) {
          cout << "Warning: " << numNoncanonicalPaths << " genome paths are not canonicalized,"
               << " so some alignments may not be found." << endl;
     }

     // ---- Genome path database: read it if present, otherwise build it. ----

     vec<big_tagged_rpint> genome_db;
     String genome_dbfile = "/genome.pathsdb_big.k" + ToString(K);
     if ( IsRegularFile( run_dir + genome_dbfile ) ) {
          BinaryRead2( run_dir + genome_dbfile, genome_db );
     }
     else {
          genome_db.reserve( genome_paths.rawsize( ) );
          for ( int i = 0; i < genome_paths.size( ); i++ )
               genome_paths[i].AppendToDatabase( genome_db, i );
          Prepare(genome_db);
     }

     // genome_len[i][s] = total length of segments 0..s-1 of genome path i,
     // i.e. the kmer offset at which segment s starts.
     vec< vec<int> > genome_len( genome_paths.size( ) );
     for ( int i = 0; i < genome_paths.size( ); i++ ) {
          const KmerPath& gp = genome_paths[i];
          vec<int>& offsets = genome_len[i];
          offsets.push_back(0);
          for ( int j = 0; j < gp.NSegments( ); j++ )
               offsets.push_back( offsets.back( ) + gp.Segment(j).Length( ) );
     }

     // ---- Obtain placements: load them, or compute (and optionally save). ----

     vec<genome_placement> locs;
     vec<int> actual_copy_number( paths.size( ), 0 );
     if ( LOAD_PLACEMENTS_FROM.nonempty( ) ) {
          BinaryRead2( LOAD_PLACEMENTS_FROM, locs );
          for ( int i = 0; i < locs.isize( ); i++ ) {
               // A placements file made for a different PATHS set could carry
               // read ids beyond this run's paths; refuse to index out of range.
               const int read_id = locs[i].GetReadId( );
               ForceAssert( read_id >= 0 && read_id < actual_copy_number.isize( ) );
               actual_copy_number[read_id] = locs[i].GetCopyNumber( );
          }
     }
     else {
          // Scratch objects reused across iterations.
          KmerPath p;
          vec<longlong> places;
          const int num_passes = ( FW_ONLY ? 1 : 2 );

          for ( int i = 0; i < paths.size( ); i++ ) {
               p = paths[i];
               int pkmers = p.KmerCount( );
               // Skip empty paths as well: they have no middle kmer, and pkmers
               // is used as a divisor below.
               if ( pkmers <= 0 || pkmers < MIN_KMERS ) continue;

               // Pass 1 places the path as given; pass 2 places its reverse.
               for ( int pass = 1; pass <= num_passes; pass++ ) {
                    if ( pass == 2 ) p.Reverse( );

                    // Seed on the kmer halfway along the path.
                    KmerPathLoc pmid = p.Begin( ), pmid_copy;
                    pmid.IncrementHaltAtGap( pkmers/2 );
                    longlong middle_kmer = pmid.GetKmer( );
                    Contains( genome_db, middle_kmer, places );

                    for ( int j = 0; j < places.isize( ); j++ ) {
                         const big_tagged_rpint& t = genome_db[ places[j] ];
                         int id = t.PathId( ), pp = t.PathPos( );
                         if ( id < 0 ) continue;

                         KmerPathLoc matchC( genome_paths[id], pp ), matchL, matchR;
                         matchC.SetKmer(middle_kmer);

                         // These scans may fail if the genome paths are
                         // not canonicalized (which may not be possible
                         // for genome-based datasets).
                         // Each scan starts from a fresh copy of the seed
                         // location on both the path and the genome.
                         ScanLeftPerfectMatch( pmid_copy = pmid, matchL = matchC );
                         ScanRightPerfectMatch( pmid_copy = pmid, matchR = matchC );

                         int matchlen = KmersInInterval( matchL, matchR );
                         if ( (100 * matchlen)/pkmers < MATCH_LEN_PERCENT ) continue;

                         // Start/end: offset within the segment plus the
                         // segment's starting offset; the end adds K-1.
                         // The copy-number field is provisional here and is
                         // overwritten after sorting.
                         locs.push_back( genome_placement( i, pkmers, id,
                              matchL.GetLoc( ) + genome_len[id][ matchL.GetIndex( ) ],
                              matchR.GetLoc( ) + genome_len[id][ matchR.GetIndex( ) ] + K-1,
                              pass == 2, places.size( ) ) );
                    }
               }
          }

          Sort(locs);

          // Observed copy number of a path = number of placements it received.
          for ( int i = 0; i < locs.isize( ); i++ )
               ++actual_copy_number[ locs[i].GetReadId( ) ];
          for ( int i = 0; i < locs.isize( ); i++ )
               locs[i].SetCopyNumber( actual_copy_number[ locs[i].GetReadId( ) ] );

          if ( SAVE_PLACEMENTS_TO.nonempty( ) ) {
               BinaryWrite2( SAVE_PLACEMENTS_TO, locs );
          }
     }

     // ---- Compute coverage stats. ----

     // hit[i][b] is True when position b of genome path i lies inside some
     // placement; each vector has KmerCount + K-1 entries.
     vec< vec<Bool> > hit( genome_paths.size( ) );
     for ( int i = 0; i < genome_paths.size( ); i++ )
          hit[i].resize( genome_paths[i].KmerCount( ) + K-1, False );
     for ( int i = 0; i < locs.isize( ); i++ ) {
          const genome_placement& pl = locs[i];
          vec<Bool>& covered = hit[ pl.GetGenomeId( ) ];
          for ( int j = pl.GetStartOnGenome( ); j <= pl.GetEndOnGenome( ); j++ )
               covered[j] = True;
     }

     longlong total = 0, total_hit = 0;
     int gaps = 0;
     for ( int i = 0; i < genome_paths.size( ); i++ ) {
          // Count the denominator in the same units as total_hit (entries of
          // hit[i]); using KmerCount alone lets the ratio exceed 100%.
          total += hit[i].isize( );
          total_hit += Sum( hit[i] );

          for ( int j = 0; j < hit[i].isize( ); j++ ) {
               if ( hit[i][j] ) continue;
               ++gaps;
               // Find the end (exclusive) of this uncovered run.
               int m;
               for ( m = j+1; m < hit[i].isize( ); m++ )
                    if ( hit[i][m] ) break;
               cout << "gap at " << i << "." << j << "-" << m
                    << " of " << hit[i].size( ) << "\n";
               // Position m is covered (or past the end), so resume after it.
               j = m;
          }
     }
     cout << "\nThere are " << paths.size( ) << " unipaths, having " << gaps
          << " gaps and covering " << PERCENT_RATIO( 6, total_hit, total )
          << " of the genome.\n\n";

     // ---- Compute N50 placed unipath size. ----

     vec<int> ulen( locs.size( ) );
     for ( int i = 0; i < locs.isize( ); i++ )
          ulen[i] = locs[i].GetKmerCount( ) + K - 1;
     Sort(ulen);
     if ( ulen.nonempty( ) )
          cout << "N50 placed unipath size = " << N50(ulen) << "\n\n";

     // ---- Show placements. ----

     if ( SHOW_PLACEMENTS ) {
          vec<int> to_show;
          if ( SHOW_THESE_CONTIGS_ONLY.nonempty( ) )
               ParseIntSet( SHOW_THESE_CONTIGS_ONLY, to_show );

          for ( int i = 0; i < locs.isize( ); i++ ) {
               genome_placement& p = locs[i];

               // Orientation filter.
               if ( SHOW_FW_ONLY && !p.IsFw( ) ) continue;

               // Contig filter (an empty set means "all contigs").
               if ( to_show.nonempty( ) && !BinMember( to_show, p.GetGenomeId( ) ) )
                    continue;

               // Window filter: each bound applies on its own, and -1 means
               // unbounded on that side, so giving only SHOW_WINDOW_BEGIN no
               // longer suppresses every placement.
               if ( SHOW_WINDOW_BEGIN != -1 && p.GetEndOnGenome( ) < SHOW_WINDOW_BEGIN )
                    continue;
               if ( SHOW_WINDOW_END != -1 && p.GetStartOnGenome( ) > SHOW_WINDOW_END )
                    continue;

               if ( KMER_COORDS ) {
                    cout << "path " << setiosflags(ios::fixed) << setw(6) << p.GetReadId()
                         << resetiosflags(ios::fixed) << " (" << p.GetKmerCount() << " kmers)"
                         << " --> " << p.GetGenomeId() << "." << p.GetStartOnGenome()
                         << "-" << p.GetEndOnGenome() - K + 2 << ( p.IsRc() ? " (rc)" : " (fw)" )
                         << " [" << p.GetCopyNumber() << " places]"
                         << "\n";
               }
               else {
                    cout << p;
               }
          }
     }

     if ( UNIPATH ) {
          // How good are the probabilities we assigned to copy numbers?
          // This requires that reads.unipaths.predicted_count.k
          // has already been generated (by UnipathCoverage).

          String copy_pdf_file( COPY_PDF_FILE );
          if ( copy_pdf_file.empty( ) )
               copy_pdf_file = run_dir + "/reads.unipaths.predicted_count.k" + ToString(K);

          if ( ! IsRegularFile( copy_pdf_file ) ) {
               cout << "You can only CheckPredictedUnipathCoverage if the file\n  "
                    << copy_pdf_file
                    << "\n(or specify COPY_PDF_FILE= ) has been generated (by UnipathCoverage)." << endl;
               ForceAssert( IsRegularFile( copy_pdf_file ) );
          }

          vecvec< pair<int,double> > unipath_copy_pdf( copy_pdf_file );
          ForceAssertEq( actual_copy_number.isize(), unipath_copy_pdf.size() );

          // Make bins based on predicted probabilities, and count how
          // often those predictions were made and were correct.
          const int num_bins = 20;
          vec<longlong> numer(num_bins+1,0), denom(num_bins+1,0);
          // Break down error rates into predictions of 0, 1, 2, 3+
          vec< vec<longlong> > numer_break(4,numer);
          vec< vec<longlong> > denom_break(4,denom);

          for ( int i = 0; i < paths.size( ); i++ ) {
               int pkmers = paths[i].KmerCount( );
               if ( pkmers < MIN_KMERS ) continue;
               const serfvec< pair<int,double> >& pdf = unipath_copy_pdf[i];
               int c = actual_copy_number[i];
               // Only consumed by the disabled diagnostic below.
               double predict_max_p=0.0, predict_true_p=0.0;
               for ( int j = 0; j < pdf.size( ); j++ ) {
                    predict_max_p = Max( predict_max_p, pdf[j].second );

                    // Bin by predicted probability; clamp so a value that is
                    // marginally outside [0,1] cannot index out of range.
                    int bin = int(floor( num_bins * pdf[j].second ));
                    if ( bin < 0 ) bin = 0;
                    if ( bin > num_bins ) bin = num_bins;

                    // Predicted copy numbers above 2 share breakdown slot 3.
                    int br = ( pdf[j].first > 2 ? 3 : pdf[j].first );

                    denom[bin]++; denom_break[br][bin]++;
                    if ( pdf[j].first == c ) {
                         numer[bin]++; numer_break[br][bin]++;
                         predict_true_p = pdf[j].second;
                    }
               }
//             if( predict_max_p > 10*predict_true_p ) {
//               cout << "Unipath " << i << ", " << pkmers << " kmers"
//                    << ": actually " << c << ", predicted ";
//               for( int j=0; j < pdf.size(); j++ )
//                 cout << pdf[j].first << "(" << setprecision(3) << pdf[j].second << ") ";
//               cout << endl;
//             }
          }

          // One row per probability bin: overall accuracy, then the breakdown
          // per predicted copy number, then the number of predictions.
          cout << "\n\n bin accuracy pred=0 pred=1 pred=2 pred3+  tot#predictions\n";
          for ( int bin = 0; bin <= num_bins; bin++ ) {
               cout << setw(3) << (100 * bin) / num_bins << "%   ";

               if ( denom[bin] == 0 )
                    cout << "  --    ";
               else
                    cout << setw(3) << (100*numer[bin])/denom[bin] << "%    ";

               for ( int slot = 0; slot < denom_break.isize( ); slot++ ) {
                    const longlong hits = numer_break[slot][bin];
                    const longlong made = denom_break[slot][bin];
                    // Small counts are shown as a fraction, larger ones as a
                    // percentage; the branches keep the column width roughly
                    // constant.
                    if ( made == 0 )
                         cout << "  --   ";
                    else if ( made < 10 )
                         cout << " " << hits
                              << "/" << made << "   ";
                    else if ( made < 100 && hits < 10 )
                         cout << hits
                              << "/" << made << "   ";
                    else
                         cout << setw(3) << (100*hits)/made
                              << "%   ";
               }
               cout << "  " << denom[bin] << "\n";
          }
          cout << endl;

          if ( UNIPATH_PLOT ) {
               using namespace ns_whiteboard;
               whiteboard board;

               const float x_scale = 0.01;
               const float y_scale = 25.0;
               const float y_offset = 10 * y_scale;

               // A baseline for each contig:
               float x_max=0, y_max=0;

               for ( int i = 0; i < genome_len.isize( ); i++ ) {
                    float x1 = x_scale * genome_len[i].back();
                    board.Add(new line( xy_coords( 0, i*y_offset ),
                                        xy_coords( x1, i * y_offset ),
                                        2.0, black ));
                    x_max = Max(x_max, x1);
               }

               // For each placement of a unipath, plot its true and guessed copy number.
               for ( vec<genome_placement>::iterator pl = locs.begin(); pl != locs.end(); pl++ ) {
                    int true_copy_no = pl->GetCopyNumber();
                    const serfvec< pair<int,double> >& pdf = unipath_copy_pdf[pl->GetReadId()];
                    // Guess = sum over the pdf of copy number times probability.
                    float guess_avg = 0;
                    for ( int j = 0; j < pdf.size( ); j++ )
                         guess_avg += pdf[j].first * pdf[j].second;

                    float
                         x0 = x_scale * pl->GetStartOnGenome(),
                         x1 = x_scale * pl->GetEndOnGenome(),
                         y_true = y_offset * pl->GetGenomeId() + y_scale * true_copy_no,
                         y_guess = y_offset * pl->GetGenomeId() + y_scale * guess_avg;

                    board.Add(new line( xy_coords( x0, y_guess ),
                                        xy_coords( x1, y_guess ),
                                        2.0, red ));
                    board.Add(new line( xy_coords( x0, y_true ),
                                        xy_coords( x1, y_true ),
                                        1.0, green ));
                    y_max = Max(y_max, Max(y_true, y_guess));
               }

               String ps_filename = "unipath_copy_number.ps";
               Ofstream( ps_file, ps_filename );
               ps_display ps_out( ps_file, x_max, y_max, 10.0 );
               board.DisplayOn( &ps_out );
               cout << "\nPostscript plot saved in " << ps_filename << endl;

               // Release the line objects allocated with new above.
               board.DeletePointers();

          } // if( UNIPATH_PLOT )

     } // if( UNIPATH )

     // ---- Compute and print the unipath adjacency graph and predicted copy numbers. ----

     if ( PRINT_ADJACENCY_GRAPH ) {
          String KS = ToString(K);

          // All inputs here come from the run directory, independent of PATHS;
          // the names are distinct from the outer `paths` to avoid shadowing.
          vecKmerPath read_paths( run_dir + "/reads.paths.k" + KS );
          vecKmerPath read_paths_rc( run_dir + "/reads.paths_rc.k" + KS );
          BREAD2( run_dir + "/reads.pathsdb.k" + KS, vec<tagged_rpint>, pathsdb );
          vecKmerPath unipaths( run_dir + "/reads.unipaths.k" + KS );
          BREAD2( run_dir + "/reads.unipathsdb.k" + KS, vec<tagged_rpint>,
               unipathsdb );
          vecvec< pair<int,double> > cp;
          cp.ReadAll( run_dir + "/reads.unipaths.predicted_count.k" + KS );

          digraph A;
          BuildUnipathAdjacencyGraph( read_paths, read_paths_rc, pathsdb, unipaths,
               unipathsdb, A );

          cout << "\nAdjacency graph and predicted copy numbers:\n";
          for ( int v = 0; v < A.N( ); v++ ) {
               // Predicted copy numbers with their probabilities.
               cout << "\n" << v << ":";
               for ( int j = 0; j < cp[v].size( ); j++ ) {
                    cout << " " << cp[v][j].first << "(" << setprecision(3)
                         << cp[v][j].second << ")";
               }

               // Vertices listed by A.To(v), then those listed by A.From(v).
               cout << "\n" << v << " <--";
               for ( int j = 0; j < A.To(v).isize( ); j++ )
                    cout << " " << A.To(v)[j];
               cout << "\n";
               cout << v << " -->";
               for ( int j = 0; j < A.From(v).isize( ); j++ )
                    cout << " " << A.From(v)[j];
               cout << "\n";
          }
     }

}
