/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2006) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

/// Count how often errors occur for different levels of quality
/// \file EvaluateQuals.cc
///
/// Produces five columns: qual score range, count, #errors, %errors, and 
/// observed quality score.
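/// If PERFECT_SOLEXA=True, print out some info about "perfect" reads (qual
/// average > 29.9). NOTE: main() below does not declare a PERFECT_SOLEXA
/// argument, so this option appears to be unavailable at present.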
///
/// If NAMES_454=True, READS must be a fasta file, the read names must be
/// integers, and the alignments must have been generated
/// with QUERY_NAMING=True (as in regular 454 processing).
/// Otherwise, READS must be a fastb file, and the alignments must not have
/// been generated with QUERY_NAMING=True.
///
/// BIDIR indicates whether to use reads aligning in both directions.
/// If !BIDIR, DIR indicates whether to use forward (0) or rc (1) reads.
/// 
/// MISMATCHES, UNDERCALLS and OVERCALLS (all default to True): allow the
/// user to choose which types of errors to look at.




#include "MainTools.h"
#include "Basevector.h"
#include "Qualvector.h"
#include "lookup/LookAlign.h"
#include "lookup/LookAlignFinder.h"
#include "lookup/LookAlignCleanup.h"
#include "util/QualInfo.h"   
#include "math/Functions.h"   
#include "math/HoInterval.h"   
#include "FastaFileset.h"


/// Print to out information about "perfect" reads with minAligns <= number of 
/// alignments < maxAligns.
///
/// This is a declaration only: no definition and no call of PrintPerfect is
/// visible in this file, so linking relies on a definition elsewhere (if the
/// function is used at all).
///
/// \param out           stream the report is written to.
/// \param aligns        alignment records.
/// \param alignIndices  presumably, for each read, the indices into aligns of
///                      that read's alignments -- TODO confirm against the
///                      definition.
/// \param quals         quality scores (presumably parallel to reads --
///                      TODO confirm).
/// \param reads         read sequences.
/// \param minAligns     inclusive lower bound on the number of alignments.
/// \param maxAligns     exclusive upper bound on the number of alignments.
void PrintPerfect(ostream & out, const vec<look_align_plus> & aligns,
		  const vec<vec<int> > & alignIndices, 
		  const vecqualvector & quals,
		  const vecbasevector & reads,
		  int minAligns, int maxAligns);

int main( int argc, char *argv[])
{
  RunTime();

  BeginCommandArguments;
  CommandArgument_VecString(READS);
  CommandArgument_VecString(REF);
  CommandArgument_VecString(ALIGNS);
  CommandArgument_VecString(QUALS);

  // :-separated list of high quality interval files. We will only
  // use alignments completely within a HQ interval in constructing
  // the table. The list may be incomplete or have fake file names,
  // such as "none", in which case we just ignore the name.
  CommandArgument_VecString_OrDefault(CALL_ONLY, "");

  CommandArgument_String_OrDefault(DATAFILE,"");
  CommandArgument_Int_OrDefault(BINSIZE,1);
  CommandArgument_UnsignedInt_OrDefault(EXCLUDE_ENDS,3);
  CommandArgument_Bool_OrDefault(NAMES_454,False);
  CommandArgument_Bool_OrDefault(BIDIR,True);
  CommandArgument_Bool_OrDefault(BY_ERRORS,False);
  CommandArgument_UnsignedInt_OrDefault(DIR,0);
  CommandArgument_UnsignedInt_OrDefault(MIN_QUAL,0);
  CommandArgument_UnsignedInt_OrDefault(MAX_QUAL,40);
  CommandArgument_Bool_Abbr_OrDefault(MISMATCHES, M, True);
  CommandArgument_Bool_Abbr_OrDefault(UNDERCALLS, U, True);
  CommandArgument_Bool_Abbr_OrDefault(OVERCALLS, O, True);
  EndCommandArguments;

  QualInfo info1, info2, info3, info;
  QualInfo *infoByErrors[] = {&info1, &info1, &info2, &info3};
  const int maxErrors = sizeof(infoByErrors)/sizeof(QualInfo*) - 1;
  longlong totalReads = 0; 
  longlong readcount=0; // reads used in evaluation.
  longlong removed = 0; // reads removed because bad area of reference.
  longlong nonunique = 0; //reads with non-unique alignments.

  for (int file=0; file != READS.isize(); ++file) {

    cout << "Reading in reads, quals and reference for "
	 << READS[file] << "... " << endl;
    vecbasevector reads, ref;
    vecString names;

    if (NAMES_454 || READS[file].Contains("fasta") ) {
      FastFetchReads(reads, &names, READS[file]);
    } else {
      reads.ReadAll(READS[file]);
    }

    totalReads += reads.size();

    if (REF[file].Contains("fasta") ) FastFetchReads(ref, 0, REF[file]);
    else ref.ReadAll(REF[file]);

    vecqualvector quals;
    if (QUALS[file].Contains("qualb")) {
      quals.ReadAll(QUALS[file]);
    } else {
      FastFetchQuals(quals, 0, QUALS[file]);
    }

    QualInfo::errType goodErrors = 
      QualInfo::errType (QualInfo::MIS * MISMATCHES | 
			 QualInfo::INS * OVERCALLS | 
			 QualInfo::DEL * UNDERCALLS) ;


    vec<HoIntervalWithId> valid;
    if (!CALL_ONLY.empty() && !CALL_ONLY[file].empty()) {
      Ifstream(callOnlyStream, CALL_ONLY[file]);
      valid.ReadFromTextStream(callOnlyStream);
    }
   
    const int dotter = 10000;
    cout << "Reading info from alignments in groups of "
	 << dotter << "... " << endl;
    int processed=0;

    for (LookAlignFinder finder(ALIGNS[file]); !finder.empty(); ++finder) {
      DotMod(cout, ++processed, dotter);

      int i = finder.QueryId();
      vec<look_align_plus> & aligns = finder.Aligns();
      if (aligns.size() != 1) {
        ++nonunique;
	continue;
      }
      if (!valid.empty() && RejectAlign (valid, aligns[0]) ) { 
	++removed; 
	continue;
      }

      const look_align_plus & la = aligns[0];
      if (BIDIR || DIR==la.rc1) {
	++readcount;
	basevector & read = reads[i];
	qualvector & qual = quals[i];
	const basevector & thisref = ref[la.target_id];
	info.AddAlignment(la, read, qual, thisref, goodErrors, EXCLUDE_ENDS);
	if (BY_ERRORS) {
	  infoByErrors[max(maxErrors, la.Errors())]->
	    AddAlignment(la, read, qual, thisref, goodErrors, EXCLUDE_ENDS);
	}
      }
    }
    cout << endl;

  } // end of for each file loop.

  if (BY_ERRORS) {
    cout << "\nFor reads with 0-1 errors:\n";
    info1.PrintBins(cout, BINSIZE, MIN_QUAL, MAX_QUAL);
    cout << "\nFor reads with 2 errors:\n";
    info2.PrintBins(cout, BINSIZE, MIN_QUAL, MAX_QUAL);
    cout << "\nFor reads with 3 or more errors:\n";
    info3.PrintBins(cout, BINSIZE, MIN_QUAL, MAX_QUAL);
  }

  cout << "\nFor all reads:\n";
  cout << info.BinHeaders();
  info.PrintBins(cout, BINSIZE, MIN_QUAL, MAX_QUAL);

  cout << "\n" << removed << " uniquely aligned reads removed because the "
    "best alignment\nwas not in a high quality region of the reference.\n\n";

  PRINT4(totalReads, readcount, nonunique, removed);

  cout << "R2ideal " << info.R2ideal(MIN_QUAL, MAX_QUAL+1) 
       << " >q30 " << info.ActualQGreater(30) << " ";
  info.PrintErrors(cout);

  if (!DATAFILE.empty()) {
    Ofstream(os, DATAFILE);
    info.DataForGraph(os, MIN_QUAL, MAX_QUAL);
  }

  return 0;
}
