/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2005) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

/** 

\file ConstructPhredTable.cc

Read in one or more sffinfo files and corresponding references and alignments,
converting to format expected by PhredTable, and then compute 
Phred table for quality scoring.

*/

#define NDEBUG

#include "MainTools.h"
#include "Alignment.h"
#include "Basevector.h"
#include "Feudal.h"
#include "TokenizeString.h"
#include "454/qual/PhredTableWriter.h"
#include "454/qual/PhredTableReader.h"
#include "454/sff/SffTypes.h"
#include "454/sff/SffInfo.h"
#include "454/sff/SffRead.h"
#include "lookup/LookAlign.h"
#include "lookup/LookAlignSort.h"
#include "lookup/LookAlignCleanup.h"
#include "lookup/VecFromLookAlign.h"
#include "assembly/BoundAlignment.h"
#include "assembly/BasicSequence.h"
#include "FastaFileset.h"



/**
   Entry point. For every (SFFINFO, REF, QLTOUT) triple: load the reference,
   the sffinfo data and the sorted alignments, classify each aligned base of
   each uniquely aligned read, and accumulate the per-base predictor values
   together with the per-base classification. The accumulated vectors are
   then handed to a PhredTableWriter, which builds the table and writes it
   to OUT_PREFIX.phredtable.

   \return 0 after the table has been written.
*/
int main( int argc, char *argv[] )
{
  RunTime();

  BeginCommandArguments;
  // The sffinfo file with the data, or a :-separated list of such
  // files. This should be the header name for a forest of binary
  // sffinfo files that have been processed and aligned by us.
  CommandArgument_VecString(SFFINFO);
  // Reference in fasta or fastb format, or a :-separated list.
  CommandArgument_VecString(REF);
  // The alignment file corresponding to the sffinfo above, or a
  // :-separated list.  This should be a file processed with
  // readnumbers in sync with SFFINFO (that is, with
  // QUERY_NAMING=numeric).
  CommandArgument_VecString(QLTOUT);
  // :-separated list of high quality interval files. We will only
  // use alignments completely within a HQ interval in constructing
  // the table. The list may be incomplete or have fake file names,
  // such as "none", in which case we just ignore the name.
  CommandArgument_VecString_OrDefault(CALL_ONLY, "");
  // Output goes to OUT_PREFIX.phredtable. Although the declared default
  // is empty, a non-empty value is enforced below whenever SFFINFO is
  // non-empty.
  CommandArgument_String_Abbr_OrDefault(OUT_PREFIX,O,"");
  // File with non-default predictor parameters.
  CommandArgument_String_OrDefault(PARAMS,"");
  // Print out error rates by threshold for individual predictors.
  CommandArgument_Bool_Abbr_OrDefault(PRINT_INDIVIDUAL, IND, True);

  // EXCLUDE_ENDS is forwarded to RemoveErrorsAtEnds, MIN_BASES to the
  // PhredTableWriter constructor, and each *_BINS value to the
  // corresponding addPredictor/addDiscretePredictor call.
  CommandArgument_UnsignedInt_OrDefault(EXCLUDE_ENDS, 3);
  CommandArgument_UnsignedInt_OrDefault(MIN_BASES, 5000);
  CommandArgument_UnsignedInt_OrDefault(READPOS_BINS, 10);
  CommandArgument_UnsignedInt_OrDefault(FLOWALIGN_BINS, 10);
  CommandArgument_UnsignedInt_OrDefault(OVERLAP_BINS, 10);
  CommandArgument_UnsignedInt_OrDefault(HP_BINS, 10);
  CommandArgument_UnsignedInt_OrDefault(IE_BINS, 10);
  CommandArgument_UnsignedInt_OrDefault(OTHER_BINS, 10);
  EndCommandArguments;

  SffPredictorParameters predParamHandler;

  if (!PARAMS.empty()) {
    predParamHandler.SetFromFile(PARAMS);
  }

  // The SFFINFO, REF, and QLTOUT lists must be parallel
  ForceAssertEq(SFFINFO.size(), QLTOUT.size());
  ForceAssertEq(SFFINFO.size(), REF.size());

  if (SFFINFO.size()>0) ForceAssert(!OUT_PREFIX.empty());

  // Only referenced by the commented-out consistency checks below.
  vecbasevector checkReads;
  vecString checkNames;

  const bool USE_OTHER = SffRead::HasOther();
  const int NUM_CAFIE = SffRead::CafieNumber();

  // Number of predictor values consumed per base: readPos, flowalign,
  // overlap, homopol, NUM_CAFIE cafie values, and optionally "other".
  const int NUM_VALUES = 4 + NUM_CAFIE + (USE_OTHER ? 1 : 0);

  // Parallel per-base accumulators: entry k of every vector describes the
  // same base, with incorrect[k] holding its alignment classification.
  vec<float> incorrect, homopol, readPos, flowalign, overlap, other;
  vec<vec<float> >  cafie(NUM_CAFIE);
  //PRINT5(AlignErr::NO_BASE, AlignErr::MISMATCH, AlignErr::MATCH, AlignErr::INSERTION, AlignErr::DELETION);
  for (int i=0; i !=SFFINFO.isize(); ++i) {
    cout << "Reading in reference " << REF[i] << endl;
    vecbasevector ref;
    // Names containing "fasta" go through the fasta reader; everything
    // else is read with vecbasevector::ReadAll.
    if (REF[i].Contains("fasta")) FastFetchReads(ref,0,REF[i]);
    else ref.ReadAll(REF[i]);

    cout << "Reading in information for SffInfo " << SFFINFO[i] << endl;
    SffInfo info;
    info.Read(SFFINFO[i]);
    vec<look_align_plus> aligns;
    vec<vec<int> > alignIndices;
    GetSortedAlignIndices(QLTOUT[i], aligns, alignIndices);

    // The integer result of ProcessCallOnly was never used; only its
    // effect on aligns/alignIndices matters here.
    ProcessCallOnly(i, CALL_ONLY, aligns, alignIndices);

    basevector bases;
    cout << "Processing " << info.size() << " reads in groups of 1000\n";


    for (SffRead r(info); r.Index() != alignIndices.isize(); ++r) {
      DotMod(cout, r.Index(), 1000);

#ifndef CHECK_AGAINST_QUALSFROMSFF
      // Use only reads with exactly one alignment.
      if (alignIndices[r.Index()].size() != 1) continue;
#endif
      //AssertEq(r.Name(), checkNames[r.Index()]);
      //r.Bases().Print(cout);
      //checkReads[r.Index()].Print(cout);
      r.ClippedBases(bases);

      //calculate the correct/incorrect vector for this: fill it up with
      // NO_BASE, MISMATCH, MATCH, INSERTION or DELETION
      vec<unsigned char> readIncorr;
      vec<int> delCount;
#ifndef CHECK_AGAINST_QUALSFROMSFF
      const look_align_plus & la = aligns[alignIndices[r.Index()][0]];
      VecFromLookAlign(readIncorr, delCount, bases, ref, la, 0);
      SetDeletionsRandomly(readIncorr, delCount);
      RemoveErrorsAtEnds(readIncorr,EXCLUDE_ENDS);

      //use the readIncorrect vector to figure out which bases to add
      //to our predictors.
#endif
      // readIncorr is indexed relative to the clipped read, while
      // GetPredictors is called with b+LEFT, i.e. offset by the first
      // clip value.
      const int LEFT = r.Clips().first;
      vec<float> values;
      for (int b=0; b != readIncorr.isize(); ++b) {
#ifndef CHECK_AGAINST_QUALSFROMSFF
        if (AlignErr::NO_BASE == int(readIncorr[b])) continue;
#endif
        r.GetPredictors(values, b+LEFT);
        // Fail loudly rather than read past the end of values.
        ForceAssert(values.isize() >= NUM_VALUES);

        // Consume the predictor values strictly in order. The cursor is
        // deliberately not named i, which is the enclosing file index.
        int v=0;
        readPos.push_back(values[v++]);
        flowalign.push_back(values[v++]);
        overlap.push_back(values[v++]);
        homopol.push_back(values[v++]);
        for (int j=0; j != NUM_CAFIE; ++j) {
          cafie[j].push_back(values[v++]);
        }
        // "other" follows the cafie values, so it must be read at the
        // running cursor; a fixed slot 5 is only right when NUM_CAFIE == 1.
        if (USE_OTHER) other.push_back(values[v++]);

        incorrect.push_back(readIncorr[b]);
#ifdef FIRST_100
        if (r.Index() >= 100) exit(0);
        cout << b +LEFT << " " << as_base(bases[b]) << " ";
        copy(values.begin(), values.end(), ostream_iterator<float>(cout," "));
        cout << "\n";
#endif
      } //end of for each base loop.
    } //end of for each read loop.
  } //end of for each info loop.

  PhredTableWriter p(MIN_BASES);

  // Predictors are registered in the same order in which their values
  // were consumed above.
  p.addPredictor(readPos, READPOS_BINS);
  cout << "added readPos" << endl;
  p.addPredictor(flowalign, FLOWALIGN_BINS);
  cout << "added flowalign" << endl;
  p.addPredictor(overlap, OVERLAP_BINS);
  cout << "added overlap" << endl;
  p.addDiscretePredictor(homopol, HP_BINS);
  cout << "added homopol" << endl;
  for (int j=0; j != NUM_CAFIE; ++j) {
    p.addDiscretePredictor(cafie[j], IE_BINS);
    cout << "added cafie " << j << endl;
  }
  if (USE_OTHER) {
    p.addPredictor(other, OTHER_BINS);
    cout << "added other" << endl;
  }


  cout << "Making Phred table..." << endl;
  // Incorrect tabulates whether the base matches the reference.
  // It uses the enum in PhredTable
  // Deletions are distributed between the previous and succeeding bases.
  p.MakePhredTable(incorrect);

  // Held by value rather than as a reference bound to a temporary.
  const String table = OUT_PREFIX + ".phredtable";

  p.WritePhredTable(table, & predParamHandler);

  if (PRINT_INDIVIDUAL) p.PrintPredErrorRates(cout);

  return 0;

}
