/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2006) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////
#ifndef FORCE_DEBUG
#define NDEBUG
#endif 

#include "MainTools.h"
#include "Basevector.h"
#include "Qualvector.h"
#include "PackAlign.h"
#include "lookup/LookAlignSort.h"
#include "lookup/LookAlign.h"
#include "FastaFileset.h"
#include "assembly/BoundAlignment.h"
#include "assembly/BasicSequence.h"
#include "polymorphism/BaseCountHolder.h"
#include "polymorphism/BaseCountHolder2.h"

/** Create a coverage file for bases with appropriate NQS.  \file MapNQSCoverage.cc
 **/

// Help text for the program; passed to CommandDoc() in main().
// It describes the required inputs and the layout of the coverage file
// written by OutputBaseCounts.
const char *DOC = "Create a coverage file for bases with appropriate NQS.\n\n"
"Files FASTA, QUAL and QLTOUT must be in sync, with the alignments "
"having been generated with QUERY_NAMING=numeric.  You can specify "
"multiple files for each, using StringSet format, as long as the number of files "
"is the same for all 3.  Alternatively, QUAL can be empty (NOTE it has "
"no default value!) which indicates that you wish to use 30 for all "
"qual values, bypassing typical quality-score checks.\n\n"
"We only use reads with unique alignments, where unique is defined "
"according to the parameter ALIGN_COMP.\n\n"
"The file has one line and 20-60 bytes for each covered base, so it can "
"be pretty big.  It can be summarized by CallPolymorphismsFromMap and "
"various other tools.\n\n"
"The coverage file has one block for each contig in the reference.  The format "
"is controlled by the PRINT_RAW, PRINT_ACCEPTED, and PRINT_INDELS options.  "
"By default (all of these true) the format is as shown below. " 
"Columns are tab-delimited.\n\n"
"- pos: 0-counted base on contig\n"
"- ref: reference base\n"
"- subsequent columns: \n"
"   - 8 columns for raw coverage, fw and then rc (1 col per base).\n"
"   - 8 columns for accepted coverage, fw and rc\n"
"   - total raw cov\n"
"   - total accepted cov\n"
"   - 8 columns for accepted insertion coverage, fw and rc (1 col per base).\n"
"   - total accepted insertion cov\n"
"   - 2 columns for accepted deletion coverage, fw and rc.\n"
"   - total accepted deletion cov\n\n";

/*
Example, first 3 lines of a large file:

Contig 0 size 173427:
pos     ref     A       C       G       T       rA      rC      rG      rT      aA      aC      aG      aT      arA     arC     arG     arT     cov     accept  iA      iC      iG      iT      irA    irC      irG     irT     sumI    D       rD      sumD
0       G       0       0       17      0       0       0       22      0       0       0       15      0       0       0       17      0       39      32      0       0       0       0       0      0	0       0       0       0       0       0
*/

/* TODO: For small to medium references, a large part of the time is spent
sorting the alignments. If you are running a lot of subsampling, it
would probably pay to modify the program to either accept pre-sorted
alignments, or to run many cycles of subsampling after having read and
sorted the alignments.
*/

// GetBaseCounts, FillBaseCounts, OutputBaseCounts provide necessary
// level of indirection to allow two different implementations of
// HOLDER, the base count holder.  TODO: Implement using a general
// sparse-vector class, ideally one that switches to a dense
// representation when that is more efficient.
// GetBaseCounts (forward declaration; defined after main).
// Creates one HOLDER per reference contig for each of the four count
// categories (raw, accepted, insertion, deletion), calls FillBaseCounts once
// per entry i of FASTA (with QUAL[i] or an empty String, QLTOUT[i] and
// SUBSAMPLE[i]), and then writes the result with OutputBaseCounts.
// Returns (total raw bases, total accepted bases) summed over all read files.
template<class HOLDER> 
pair<longlong,longlong>  
GetBaseCounts(const vecbasevector & ref, const String & OUT_PREFIX,
	      const vec<String> & FASTA, const vec<String> & QUAL,
	      const vec<String> & QLTOUT, int MIN_QUAL, 
	      int NEIGHBOR_QUAL,
	      int NEIGHBORHOOD, int MAX_MISMATCHES, int MAX_INDELS, 
	      int EXCLUDE_END, const vec<double> &SUBSAMPLE, 
	      double ALIGN_ERR_RATE, double ALIGN_COMP,
	      bool PRINT_ACCEPTED, bool PRINT_INDELS,
	      bool PRINT_RAW, bool PRINT_ZERO_COV);

// FillBaseCounts (forward declaration; defined below).
// Loads one set of reads (FASTA), qualities (QUAL, or quality 30 for every
// base when QUAL is empty) and alignments (QLTOUT), then adds the coverage of
// each approved, subsample-selected read to the count holders, which are
// indexed by target contig id.
// Returns (bases added to raw coverage, bases added to accepted coverage).
// NOTE(review): the return type is pair<int,int> while the definition counts
// in longlong, so very large totals would be narrowed.
template<class V>
pair<int,int> FillBaseCounts
(const vecbasevector & ref, V & baseCounts,
 V & approvedBaseCounts, V & insBaseCounts, V & delBaseCounts,
 String FASTA, String QUAL, String QLTOUT, 
 unsigned MIN_QUAL, unsigned NEIGHBOR_QUAL, unsigned NEIGHBORHOOD, 
 unsigned MAX_MISMATCHES, unsigned MAX_INDELS, int EXCLUDE_END, 
 double SUBSAMPLE, double ALIGN_ERR_RATE, double ALIGN_COMP,
 Bool PRINT_ACCEPTED, Bool PRINT_INDELS);

// OutputBaseCounts (forward declaration; defined below).
// Writes the coverage map to OUT_PREFIX + ".coverage_map": for each reference
// contig a banner line and a column-header line, then one tab-delimited line
// per base.  PRINT_RAW, PRINT_ACCEPTED and PRINT_INDELS select which column
// groups are written; bases whose raw count sum is zero are skipped unless
// PRINT_ZERO_COV is set.
template<class V>
void OutputBaseCounts
(const vecbasevector & ref, V & baseCounts,
 V & approvedBaseCounts, V & insBaseCounts, V & delBaseCounts,
 Bool PRINT_RAW, Bool PRINT_ACCEPTED, Bool PRINT_INDELS, Bool PRINT_ZERO_COV, 
 String OUT_PREFIX);

int main( int argc, char *argv[] )
{
  RunTime();

  BeginCommandArguments;
  CommandDoc(DOC);
  CommandArgument_StringSet_Doc(FASTA,"Fasta read files, StringSet format.");
  CommandArgument_StringSet_Doc(QUAL,"Fasta quality files, parallel to FASTA, StringSet format. Or empty to assume Q30 for all bases.");
  CommandArgument_StringSet_Doc(QLTOUT,"Text files of lookaligns, parallel to FASTA, StringSet format.  If QueryLookupTable was used, must have used QUERY_NAMING=numeric.");
  CommandArgument_String_Doc(REF,"Fasta or fastb file for the reference.  Must match that used for QLTOUTs.");
  CommandArgument_String_Abbr_Doc(OUT_PREFIX,O,"Coverage map is written to OUT_PREFIX.coverage_map");
  CommandArgument_UnsignedInt_Abbr_OrDefault_Doc(MIN_QUAL,Q,20,"Quality required for center base."); 
  CommandArgument_UnsignedInt_Abbr_OrDefault_Doc(NEIGHBOR_QUAL, NQ, 15, "Quality required for neighbor bases.");
  CommandArgument_UnsignedInt_Abbr_OrDefault_Doc(NEIGHBORHOOD, N, 5, "Size of neighborhood on each side of center base.");
  CommandArgument_UnsignedInt_Abbr_OrDefault_Doc(MAX_MISMATCHES,MM,2,"Allow at most this number of mismatches in the NQS window.");
  CommandArgument_UnsignedInt_Abbr_OrDefault_Doc(MAX_INDELS,MI,0,"Allow at most this many indels in the NQS window.  Default of 0 implies no indels can be called.");
  CommandArgument_Int_Abbr_OrDefault_Doc(EXCLUDE_END,EE,2,"");
  CommandArgument_Bool_OrDefault_Doc(PRINT_RAW,True,"Print raw coverage: total, and fw and rc coverage per base.");
  CommandArgument_Bool_OrDefault_Doc(PRINT_ACCEPTED,True,"Print accepted coverage as defined by other params.");
  CommandArgument_Bool_OrDefault_Doc(PRINT_INDELS,True,"Also output information about indels. Applies all ins and all dels to previous base using previous base's quality. There are separate columns for insertions and deletions after the regular columns.");
  CommandArgument_Bool_OrDefault_Doc(PRINT_ZERO_COV,True,"Print a line for reference bases with no coverage.");
  CommandArgument_DoubleSet_OrDefault_Doc(SUBSAMPLE,"1.0","If less than 1, use a random subset of reads of size SUBSAMPLE * total number of reads.");
  CommandArgument_Double_OrDefault_Doc(ALIGN_ERR_RATE,0.081,"Reject alignments with a greater error rate than this.");
  CommandArgument_Double_OrDefault_Doc(ALIGN_COMP,4.0,"For an alignment to be considered unique, the second best alignment must have more than ALIGN_COMP * the error rate of the best one. If the best alignment is perfect, it is considered to have one error for this purpose.");
  CommandArgument_Int_OrDefault_Doc(SEED,0,"Seed for the random number generator for subsampling.");
  CommandArgument_Bool_OrDefault_Doc(USE_MAP,False,"If true, use a sparse vector to hold the coverage map--more efficient for a large, sparsely covered reference, but less efficient for a small densely covered one.  This is the default behavior if the reference file is larger than 10 Mb (but see FULLY_COVERED).");
  CommandArgument_Bool_OrDefault_Doc(FULLY_COVERED,False,"When the genome size is > 10 Mb, the default becomes USE_MAP==True, without regard to the coverage.  This option lets you pick the dense-vector implementation, which is more space-efficient for fully-covered genomes.");

  EndCommandArguments;

  ForceAssertEq(FASTA.size(), QLTOUT.size());
  if (!QUAL.empty())
    ForceAssertEq(QUAL.size(), FASTA.size());
  // We replicate SUBSAMPLE argument for all the FASTAs if only one was given
  if (1==SUBSAMPLE.size()) {
    while(SUBSAMPLE.size()<FASTA.size())
      SUBSAMPLE.push_back(SUBSAMPLE[0]);
  }
  ForceAssertEq(SUBSAMPLE.size(), FASTA.size());
  
  srand48(SEED);

  vecbasevector ref;
  cout << "Loading reference data from " << REF << endl;
  if (REF.Contains("fastb")) {
    ref.ReadAll(REF);
  } else {
    FastFetchReads( ref, 0, REF );
  }
  cout << "Read " << ref.size() << " reference contigs." << endl;

  //each base in the reference will use up 4 * 8 * sizeof(unsigned
  //short) = 64 bytes if we use BaseCountHolder, whether it is covered
  //or not.  we assume the machine can easily give us 640 MB of
  //memory, and if the dense vector would take more then we use
  //BaseCountHolder2 unless we need to use the dense vector.
  vec<int> sizes;
  ref.ElementSizes(sizes);
  longlong totalBases = 0;
  totalBases = accumulate(sizes.begin(), sizes.end(), totalBases);
  PRINT(totalBases);
  if (totalBases > 10000000 && !FULLY_COVERED) USE_MAP=True;

  pair<longlong, longlong> all;
  if (USE_MAP) {
    all = GetBaseCounts<BaseCountHolder2>
      (ref, OUT_PREFIX,
       FASTA, QUAL, QLTOUT,
       MIN_QUAL, NEIGHBOR_QUAL, NEIGHBORHOOD, 
       MAX_MISMATCHES, MAX_INDELS, EXCLUDE_END, 
       SUBSAMPLE, ALIGN_ERR_RATE, ALIGN_COMP,
       PRINT_ACCEPTED, PRINT_INDELS, 
       PRINT_RAW, PRINT_ZERO_COV  );
  } 
  else {
    all = GetBaseCounts<BaseCountHolder>
      (ref, OUT_PREFIX,
       FASTA, QUAL, QLTOUT,
       MIN_QUAL, NEIGHBOR_QUAL, NEIGHBORHOOD, 
       MAX_MISMATCHES, MAX_INDELS, EXCLUDE_END, 
       SUBSAMPLE, ALIGN_ERR_RATE, ALIGN_COMP,
       PRINT_ACCEPTED, PRINT_INDELS, 
       PRINT_RAW, PRINT_ZERO_COV  );
  }

  longlong allbases = all.first, allapp = all.second;
  PRINT2(allbases, allapp);

  return 0;
}

// GetBaseCounts: build the coverage counts for every read file and write
// the coverage map.
//
// HOLDER is the per-contig count container (BaseCountHolder or
// BaseCountHolder2 in this file).  One HOLDER per reference contig is created
// for each of the four categories: raw, accepted, insertion and deletion
// counts.  FillBaseCounts is then called once per entry i of FASTA, with
// QUAL[i] (or an empty String when QUAL is empty), QLTOUT[i] and
// SUBSAMPLE[i]; finally OutputBaseCounts writes OUT_PREFIX.coverage_map.
//
// Returns (total raw bases, total accepted bases) summed over all files.
template<class HOLDER> 
pair<longlong,longlong>  
GetBaseCounts(const vecbasevector & ref, const String & OUT_PREFIX,
	      const vec<String> & FASTA, const vec<String> & QUAL,
	      const vec<String> & QLTOUT, int MIN_QUAL, int NEIGHBOR_QUAL,
	      int NEIGHBORHOOD, int MAX_MISMATCHES, int MAX_INDELS, 
	      int EXCLUDE_END, const vec<double> &SUBSAMPLE, 
	      double ALIGN_ERR_RATE, double ALIGN_COMP,
	      bool PRINT_ACCEPTED, bool PRINT_INDELS,
	      bool PRINT_RAW, bool PRINT_ZERO_COV){ 

  longlong allbases = 0, allapp = 0;

  vec<HOLDER> baseCounts, approvedBaseCounts, 
    insBaseCounts, delBaseCounts;
  // Reserve up front so that growing the vector never has to relocate the
  // holders that were already constructed.
  baseCounts.reserve(ref.size());
  for (int i=0; i<ref.size(); ++i) {
    baseCounts.push_back(HOLDER(ref[i].size()));
  }
  // All four categories start from identical, freshly constructed holders.
  insBaseCounts = delBaseCounts = approvedBaseCounts = baseCounts;

  for (int i=0; i<FASTA.isize(); ++i) {
    // NOTE(review): FillBaseCounts is declared to return pair<int,int>, so a
    // single file's totals are narrowed to int before being accumulated here.
    const pair<int,int> fileTotals =
      FillBaseCounts(ref, baseCounts,
		     approvedBaseCounts, insBaseCounts, delBaseCounts,
		     FASTA[i],
		     (QUAL.empty() ? String() : QUAL[i]), QLTOUT[i],
		     MIN_QUAL, NEIGHBOR_QUAL, NEIGHBORHOOD, 
		     MAX_MISMATCHES, MAX_INDELS, EXCLUDE_END, 
		     SUBSAMPLE[i], ALIGN_ERR_RATE, ALIGN_COMP,
		     PRINT_ACCEPTED, PRINT_INDELS );
    allbases += fileTotals.first;
    allapp += fileTotals.second;
  }
  OutputBaseCounts(ref, baseCounts,
		   approvedBaseCounts, insBaseCounts,delBaseCounts,
		   PRINT_RAW, PRINT_ACCEPTED, PRINT_INDELS, PRINT_ZERO_COV, 
		   OUT_PREFIX);
  return make_pair(allbases, allapp);
}


// CountErrors: mark the positions at which alignment ba has an error.
//
// The first indels.size() entries of both vectors are reset to 0.  Then, for
// every column of the alignment, the entry at that column's position is set
// to 1 in indels if the column is a gap, or in mismatches if it is neither a
// gap nor a match.  The position is the iterator's offset on the first
// sequence, mirrored (size - offset - 1) when the first sequence of ba is
// oriented RC.  mismatches must be at least as long as indels.
void CountErrors(BasicBoundAlignment &ba, vec<unsigned int> &indels, 
		 vec<unsigned int> &mismatches) {
  const int len = indels.size();
  for (int k=0; k<len; ++k) {
    indels[k] = 0;
    mismatches[k] = 0;
  }

  for ( BasicBoundAlignment::Iterator col(&ba); !col.IsAtEnd(); ++col) {
    const bool flipped = ( ba.GetFirstOrientation( )==orient_RC );
    int where;
    if (flipped) {
      where = len - col.GetOffsetOnFirst() -1;
    } else {
      where = col.GetOffsetOnFirst();
    }

    if (col.IsGap()) {
      indels[where] = 1;
      continue;
    }
    if (!col.IsMatch()) {
      mismatches[where] = 1;
    }
  }
}
 
// FillBaseCounts: add the coverage from one read file to the count holders.
//
// Loads the reads from FASTA (fastb if the name contains ".fastb", otherwise
// fasta), the qualities from QUAL (qualb if the name contains ".qualb",
// otherwise fasta-style quals; quality 30 for every base if QUAL is empty),
// and the alignments from QLTOUT.  Only reads approved by
// ApproveUniqueLookAligns (using ALIGN_ERR_RATE and ALIGN_COMP) are used, and
// when SUBSAMPLE != 1.0 each approved read is kept only if
// drand48() <= SUBSAMPLE.
//
// For each kept read, every non-gap alignment column is added to baseCounts.
// A column additionally counts as "accepted" when all of the following hold
// at its read position:
//   - the base quality is >= MIN_QUAL,
//   - the MinWindow() result for the qualities is >= NEIGHBOR_QUAL,
//   - the SumWindow() result for mismatches is <= MAX_MISMATCHES,
//   - the SumWindow() result for indels is <= MAX_INDELS,
//   - the position is at least EXCLUDE_END bases from both read ends.
// Accepted non-gap columns go to approvedBaseCounts (if PRINT_ACCEPTED);
// accepted gap columns go to delBaseCounts (gap on the read) or
// insBaseCounts (gap on the reference) if PRINT_INDELS.
//
// The holders are indexed by the alignment's target contig id.
//
// Returns (bases added to raw coverage, bases added to accepted coverage).
// NOTE(review): the counters are longlong but the declared return type is
// pair<int,int>, so totals above INT_MAX are narrowed on return.  Fixing that
// requires changing the forward declaration and GetBaseCounts as well.
//
// Terminates via FatalErr if a read and its quality vector differ in length,
// or if the alignment index has more entries than there are reads.
template<class V>
pair<int,int> FillBaseCounts
(const vecbasevector & ref, V & baseCounts,
 V & approvedBaseCounts, V & insBaseCounts, V & delBaseCounts,
 String FASTA, String QUAL, String QLTOUT, 
 unsigned MIN_QUAL, unsigned NEIGHBOR_QUAL, unsigned NEIGHBORHOOD, 
 unsigned MAX_MISMATCHES, unsigned MAX_INDELS, int EXCLUDE_END, 
 double SUBSAMPLE, double ALIGN_ERR_RATE, double ALIGN_COMP,
 Bool PRINT_ACCEPTED, Bool PRINT_INDELS) {

  vecbasevector reads;
  vecqualvector quals;
  vec<look_align_plus> aligns;
  vec<vec<int> > alignIndices;

  cout << "Loading reads from " << FASTA << endl;
  if (FASTA.Contains(".fastb")) {
    reads.ReadAll(FASTA);
  } else {
    FastFetchReads(reads, 0, FASTA);
  }
  cout << "Read " << reads.size() << " reads." << endl;

  if (QUAL.empty()) {
    // No quality file: give every base of every read quality 30.
    quals.Reserve(reads.size(), reads.rawsize());
    for (int i=0; i != reads.size(); ++i) {
      quals.push_back_reserve(qualvector(reads[i].size(), 30));
    }
  } else {
    cout << "Loading quals from " << QUAL << endl;
    if (QUAL.Contains(".qualb")) quals.ReadAll(QUAL);
    else FastFetchQuals(quals, 0, QUAL);
  }

  // Reads and quals must be parallel, both in number and in length.
  ForceAssertEq(reads.size(), quals.size());
  for (int i=0; i != reads.size(); ++i) {
    if (int(reads[i].size()) !=  int(quals[i].size())) {
      FatalErr("Different sizes for read " << i << ": fasta is "
	       << reads[i].size() << ", quals are: "
	       << quals[i].size());
    }
  }

  cout << "Loading alignments from " << QLTOUT << endl;
  GetSortedAlignIndices(QLTOUT, aligns, alignIndices, 0, false);
  cout << "Read " << aligns.size() << " alignments." << endl;

  // The loop below indexes reads and quals by position in alignIndices.  If
  // the index is longer than the read set, QLTOUT and FASTA are out of sync
  // and that indexing would run past the end, so stop with a clear message.
  if (alignIndices.isize() > static_cast<int>(reads.size())) {
    FatalErr("Alignment index for " << QLTOUT << " has "
	     << alignIndices.size() << " entries, but " << FASTA
	     << " has only " << reads.size()
	     << " reads: the files are not in sync.");
  }

  cout << "Processing " << (999+alignIndices.size()) /1000 
       << " blocks of 1000 reads\n";
  longlong allbases = 0, allapp = 0;
  int goodReads = 0, ambigReads = 0, badReads = 0, unalignedReads = 0;
  // Scratch vectors, reused (resized) for every read.
  qualvector minNeighborhoodQual;
  vec<unsigned int> indels, winIndels, mismatches, winMismatches;
  int readlen;
  for ( int read = 0; read < alignIndices.isize( ); ++read ) {
    DotMod(cout, read, 1000);
    if (reads[read].empty()) continue;
    //use only reads with unique good alignments.
    if (! ApproveUniqueLookAligns(alignIndices, aligns, read, 
				  ALIGN_ERR_RATE, ALIGN_COMP,
				  &unalignedReads, &badReads, &ambigReads)) {
      continue;
    }
    // Random subsampling is applied only to approved reads.
    if (1.0 != SUBSAMPLE && drand48() > SUBSAMPLE) continue;
    ++goodReads;
    
    // The first index entry of an approved read is the alignment used.
    const look_align & la = aligns[alignIndices[read][0]];
    AssertEq(read, la.query_id);
    BasicBoundAlignment ba(BasicSequence(&(reads[read]), read),
			   (la.rc1 ? orient_RC : orient_FW), 
			   BasicSequence(&(ref[la.target_id]), la.target_id), 
			   orient_FW,
			   la.a);

    readlen = quals[read].size();
    // Determine min quality in window 
    minNeighborhoodQual.resize(readlen);
    MinWindow(quals[read].begin(), quals[read].end(), minNeighborhoodQual.begin(), NEIGHBORHOOD);

    // Determine # indels and mismatches in window 
    indels.resize(readlen);
    mismatches.resize(readlen);
    CountErrors(ba, indels, mismatches);
    winIndels.resize(readlen);
    SumWindow(indels.begin(), indels.end(), winIndels.begin(), NEIGHBORHOOD);
    winMismatches.resize(readlen);
    SumWindow(mismatches.begin(), mismatches.end(), winMismatches.begin(), NEIGHBORHOOD);

    // Walk through the alignment and check bases.
    for ( BasicBoundAlignment::Iterator i(&ba); !i.IsAtEnd(); ++i) {
      // Position in the read as stored; mirrored for rc alignments, the same
      // way CountErrors mirrors it.
      int qpos = la.rc1 
	? readlen - i.GetOffsetOnFirst() -1
	: i.GetOffsetOnFirst();
      // Raw coverage: every aligned (non-gap) base counts.
      if (!i.IsGap()) {
	++allbases;
	baseCounts[la.target_id].Increment(i.GetOffsetOnSecond(), i.GetBaseOnFirst(), la.rc1); 
      }
      // Accepted coverage: quality, window and read-end criteria must all hold.
      if (MIN_QUAL <= quals[read][qpos]
	  && NEIGHBOR_QUAL <= minNeighborhoodQual[qpos]
	  && winMismatches[qpos] <= MAX_MISMATCHES
	  && winIndels[qpos] <= MAX_INDELS
	  && qpos >= EXCLUDE_END 
	  && (readlen-qpos-1)>=EXCLUDE_END
	  )  {
	if (!i.IsGap() && PRINT_ACCEPTED)  {
	  approvedBaseCounts[la.target_id]
	    .Increment(i.GetOffsetOnSecond(), i.GetBaseOnFirst(), la.rc1);
	  ++allapp;
	}
	else if (i.IsGapOnFirst() && PRINT_INDELS) {
	  // Gap on the read: counted as a deletion, always under base code 0.
	  delBaseCounts[la.target_id]
	    .Increment(i.GetOffsetOnSecond(), 0, la.rc1);
	}
	else if (i.IsGapOnSecond() && PRINT_INDELS) {
	  // Gap on the reference: counted as an insertion of the read base.
	  insBaseCounts[la.target_id]
	    .Increment(i.GetOffsetOnSecond(), i.GetBaseOnFirst(), la.rc1);
	}
      } //end of if this part of the alignment is approved.
    } // loop through alignment
  } // loop over reads with unique alignments

  cout << "\n";
  PRINT4(goodReads, ambigReads, badReads, unalignedReads);
  return make_pair(allbases, allapp);
}

template<class V>
void OutputBaseCounts
(const vecbasevector & ref, V & baseCounts,
 V & approvedBaseCounts, V & insBaseCounts, V & delBaseCounts,
 Bool PRINT_RAW, Bool PRINT_ACCEPTED, Bool PRINT_INDELS, Bool PRINT_ZERO_COV, 
 String OUT_PREFIX) {
  String outname = OUT_PREFIX+".coverage_map";
  cout << "Writing coverage map to " << outname << endl;
  Ofstream(out, outname);
  for (int contig=0; contig!=ref.size(); ++contig) {
    out << "Contig " << contig << " size " << ref[contig].size() << ":\n";
    out << "pos\tref\t";
    if (PRINT_RAW)
      out << "A\tC\tG\tT\trA\trC\trG\trT\t";
    if (PRINT_ACCEPTED) 
      out << "aA\taC\taG\taT\tarA\tarC\tarG\tarT\t";
    if (PRINT_RAW)
      out << "cov\t";
    if (PRINT_ACCEPTED) 
      out << "accept\t";
    if (PRINT_INDELS) 
      out << "iA\tiC\tiG\tiT\tirA\tirC\tirG\tirT\tsumI\tD\trD\tsumD\t";
    out << "\n";
    for (int p=0; p != ref[contig].isize(); ++p) {
      if (PRINT_ZERO_COV || baseCounts[contig].Sum(p)>0) {
	out << p << "\t" << as_base(ref[contig][p]) << "\t";
	if (PRINT_RAW)
	  baseCounts[contig].PrintLine(out, p);
	if (PRINT_ACCEPTED) 
	  approvedBaseCounts[contig].PrintLine(out, p);
	if (PRINT_RAW)
	  out << baseCounts[contig].Sum(p) << "\t";
	if (PRINT_ACCEPTED)
	  out << approvedBaseCounts[contig].Sum(p) << "\t";
	if (PRINT_INDELS) {
	  insBaseCounts[contig].PrintLine(out, p);
	  out << insBaseCounts[contig].Sum(p) << "\t";
	  pair<int,int> dels = delBaseCounts[contig].FwRc(p);
	  out << dels.first << "\t" << dels.second << "\t"
	      << dels.first + dels.second << "\t";
	}
	out << "\n";
      }
    }
  }

}
