/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2006) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

/**
   Program: FindStrongKmers

   Identify <strong kmers> in the reads -- those kmers that we believe are genomic.

   Program parameters:

      PRE - the <WGA data dir>
      DATA - the data directory for our ALLPATHS project
      RUN - the directory for this program run.
      Ks - the <kmer shapes> to use; everything is done for each kmer shape in turn.
      READS_IN - the original, unedited reads (we want to use strong kmers to edit them!)
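      GC_BIASED - if True, determine the kmer frequency threshold separately for each
         GC content of a kmer; otherwise use one threshold for all kmers (default: False)
      EXCLUDE_PAIRED - whether to exclude paired reads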
      USE_TRUSTED - also use <trusted bases> information to determine which kmers are strong,
         in addition to using kmer frequency.
      ALL_STRONG - if True, then all kmers are considered strong regardless of frequency
      REBUILD -  if REBUILD is True, always rebuild the kmer frequency histograms
          from the data, i.e. don't load them in from disk if they exist.


   Input files:

      <.trusted>  -  which bases are trusted in each read

   Output files:

      <.trusted.Ks> - trusted kmers, defined by the frequency of kmers having
         trusted occurrences (an occurrence in which all bases are trusted).
         Output only if USE_TRUSTED is True.


      <.nonunique.kS> - strong kmers defined by their frequency
         (to the right of the first local minimum of the kmer frequency histogram).
       

   See also: <FindHiQualBasesInReads>, <EvaluateStrongKmers>.

   @file
*/

#ifndef FORCE_DEBUG
  #define NDEBUG
#endif

#include "MainTools.h"

#include "math/Functions.h"
#include "KmerShape.h"
#include "ReadPairing.h"
#include "Bitvector.h"

#include "kmer_freq/WriteKmerFrequencies.h"
#include "kmer_freq/KmerFrequencyTable.h"
#include "kmer_freq/TransformKmerShortMap.h"

int main( int argc, char *argv[] )
{
  RunTime( );
  
  BeginCommandArguments;
  CommandArgument_String(PRE);
  CommandArgument_String(DATA); 
  CommandArgument_String(RUN); 
  CommandArgument_KShapes2(K, Ks);
  CommandArgument_String(READS_IN);
  CommandArgument_Bool_OrDefault(GC_BIASED, False);
  CommandArgument_Bool_OrDefault(EXCLUDE_PAIRED, False);
  CommandArgument_Bool_OrDefault(USE_TRUSTED, False);
  CommandArgument_Bool_OrDefault(REBUILD, False);
  CommandArgument_Bool_OrDefault(ALL_STRONG, False);
  EndCommandArguments;

  String run_dir = PRE + "/" + DATA + "/" + RUN;

  vecbasevector allReads( run_dir + "/" + READS_IN );

  vecbitvector trusted;

  if (USE_TRUSTED)
    trusted.ReadAll(run_dir + "/" + READS_IN + ".trusted");

  // Use Paired Reads?
  if (EXCLUDE_PAIRED) {
    String pairtoFile = run_dir + "/reads.pairto";
    if ( IsRegularFile(pairtoFile) || IsRegularFile(pairtoFile + "b") ) {
      vec<read_pairing> pairs;
      ReadPairsFile( pairtoFile, pairs );

      vec<int> readIds(pairs.isize() * 2);
      int read_count = 0;
      for(int i = 0; i < pairs.isize(); ++i)
	if (pairs[i].Alive()) {
	  readIds[read_count++] = pairs[i].id1;
	  readIds[read_count++] = pairs[i].id2;
	}
      readIds.resize(read_count);
      Sort(readIds);

      // Remove paired reads
      allReads.RemoveByIndex(readIds);
      if (USE_TRUSTED) trusted.RemoveByIndex(readIds);

      cout << "Excluded " << readIds.size() << " paired reads" << "\n";
    } else 
      cout << "No paired reads to exclude." << "\n";
  }

  cout << "Data loaded." << endl;

  String trustedKmersFileBase = run_dir + "/" + READS_IN + ".trusted.k";
  vec<KmerShortMap*> trustedTablePtrs;
  if (USE_TRUSTED) {
    for ( unsigned int i = 0; i < Ks.size(); ++i ) {
#define CASE(_KSHAPE) \
        if ( REBUILD || ! IsRegularFile( trustedKmersFileBase + ToString(Ks[i]) ) ) \
          WriteKmerTrustedFrequencies<_KSHAPE>( allReads, trustedKmersFileBase + \
            ToString(Ks[i]), trusted); \
        trustedTablePtrs.push_back( new KmerShortMap(Ks[i], \
          trustedKmersFileBase+ToString(Ks[i]) ) )
      DISPATCH_ON_KSHAPE(Ks[i], CASE);
    }
  }

  Bool unique = False;
  String kmersFileBase = run_dir + "/" + READS_IN + ".nonunique.k";
  if (ALL_STRONG) {
    // All kmers are considered strong. Build table containing all kmers.
    unique = True;
    kmersFileBase = run_dir + "/" + READS_IN + ".strong.k";
  }
  vec<KmerFrequencyTable*> freqTablePtrs;
  for ( unsigned int i = 0; i < Ks.size(); ++i ) {
#define CASE2(_KSHAPE) \
        if ( REBUILD || ! IsRegularFile( kmersFileBase + ToString(Ks[i]) ) ) \
          WriteKmerFrequencies<_KSHAPE>( allReads, kmersFileBase + ToString(Ks[i]), unique ); \
        freqTablePtrs.push_back( new KmerFrequencyTable( Ks[i], kmersFileBase+ToString(Ks[i]) ) )
    
    DISPATCH_ON_KSHAPE(Ks[i], CASE2);
  }

  // All kmers in reads are considered strong. Nothing else to do.
  if (ALL_STRONG)
    exit(0);
  

  /*
    Gather the first local minimum of the kmer frequency histogram,
    for each (kmer shape, GC content) combination.  Store it in lowMultsByGC and 
    highMultsByGC (right now, both of these store the same value -- the kmer
    frequency corresponding to the first local minimum).
    
    For kmers with very high or very low GC content, it may be impossible to determine
    the first local minimum (because there are too few such kmers in the reads to make
    a good histogram).  For such cases, we take the closest GC value for which 
    we _could_ get a first local minimum, and use that value.
  */
  
  vec< vec<int> > lowMultsByGC( Ks.size() );
  vec< vec<int> > highMultsByGC( Ks.size() );
  
  for ( unsigned int i = 0; i < Ks.size(); ++i ) {
    int k = GetKmerSize(Ks[i]);
    // Obtain kmer frequency histograms for each GC content
    lowMultsByGC[i].resize( k+1 );
    highMultsByGC[i].resize( k+1 );
    for (int gc = 0; gc <= k; gc++) {
      int localMin = freqTablePtrs[i]->GetFirstLocalMin(1.25, (GC_BIASED ? gc : -1), 100);
      lowMultsByGC[i][gc] = highMultsByGC[i][gc] = localMin;
    }
    
    // Fill in empty values from 50% to 0% GC.
    int lastgood = lowMultsByGC[i][k/2];
    for (int gc = k/2-1; gc >= 0; gc-- )
      if ( lowMultsByGC[i][gc] < 0 )
	lowMultsByGC[i][gc] = highMultsByGC[i][gc] = lastgood;
      else
	lastgood = lowMultsByGC[i][gc];
    // Fill in empty values from 50% to 100% GC.
    lastgood = lowMultsByGC[i][k/2];
    for (int gc = k/2+1; gc <= k; gc++ )
      if ( lowMultsByGC[i][gc] < 0 )
	lowMultsByGC[i][gc] = highMultsByGC[i][gc] = lastgood;
      else
	lastgood = lowMultsByGC[i][gc];
    
    for (int gc = 0; gc < k+1; gc++)
	PRINT3( Ks[i], gc, lowMultsByGC[i][gc] );
  }
  
  // Make Strong Kmer Table
  cout << "Making strong kmer table..." << endl;
  String strongKmersFileBase = run_dir + "/" + READS_IN + ".strong.k";
  if (USE_TRUSTED) {
    for ( unsigned int i = 0; i < Ks.size(); ++i ) {
#define CASE3(_KSHAPE) \
        TransformKmerShortMap<_KSHAPE>( kmersFileBase + ToString(Ks[i]), \
          strongKmersFileBase + ToString(Ks[i]), \
	  tksmTrustedThresholdByGc(lowMultsByGC[i], *trustedTablePtrs[i]) )
      
      DISPATCH_ON_KSHAPE(Ks[i], CASE3);
    }

  } else {

    for ( unsigned int i = 0; i < Ks.size(); ++i ) {
#define CASE4(_KSHAPE) \
        TransformKmerShortMap<_KSHAPE>( kmersFileBase + ToString(Ks[i]), \
          strongKmersFileBase + ToString(Ks[i]), \
	  tksmThresholdByGc(lowMultsByGC[i]) )
      
      DISPATCH_ON_KSHAPE(Ks[i], CASE4);
    }
  }
  
  return 0;
}

