/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2006) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////


/** 
\file CallPolymorphismsFromMap.cc Process coverage_map file as output by MapNQSCoverage.
*/

/// Program description text; handed to CommandDoc() in main().
const char * DOC =
"Process coverage_map file as output by MapNQSCoverage.  Call variants (SNPs and indels) "
"that seem valid given the options specified.  For each variant, output short summary "
"(contig, position, refbase, newbase) followed by human-readable presentation of the "
"entire coverage_map line giving rise to that call.  Also print various summaries at end.";

/*
Arguments:
- IN: input coverage file
- TRUTH (default ""): file is formatted as list of PolyCall's. 
- TRUTH_ONLY (default False): look only at known calls in TRUTH file. 
- CALL_ONLY (default ""): look only at these intervals
- DO_NOT_CALL (default ""): ignore these intervals, overrides CALL_ONLY
- VARIANTS (default 0): The number of variants present, e.g. 1 for haploid, 2 for diploid, 2n for n pooled diploid individuals, or 0 if unknown.
- NEED_RC (default True): whether to require bidirectional reads for SNP calls.
- MIN_READS (default 2): number of reads to require for SNP calls.
- RC_OR_MIN_READS (default False): if true, accept piles that fulfill either 
the minReads or Rc criterion, but do not require both.
- MIN_RATIO (default 0): If improbable ratio of this allele to all alleles 
observed, reject this.
- OUTPUT_SNPS (default True): Whether to output the SNPs called
- OUTPUT_REF (default empty): if present, save a vecbitvector indicating
whether the reference was called at this base.
.
*/

#ifndef FORCE_DEBUG
#define NDEBUG
#endif

#include "MainTools.h"
#include "TokenizeString.h" 
#include "SeqInterval.h"
#include "CoverageAnalyzer.h"
#include "Basevector.h"
#include "math/HoInterval.h"
#include "polymorphism/BaseMapInfo.h"
#include "polymorphism/CoverageMapIterator.h"
#include "polymorphism/PolyCall.h"
#include "Bitvector.h"
#include "FastaFileset.h"

struct CallBases {
  Bool OUTPUT_SNPS, NEED_RC;
  unsigned int MIN_READS;
  double MIN_RATIO;
  Bool SUMMARY;
  unsigned int SUMMARY_WINDOW;
  int contig, nCalled, nRef, nSNP[2], nIns[2], nDel[2], nBases;
  vec<int> contigSize, contigCallable, contigCalls;
  vec< vec<int> > windowSize, windowCallable, windowCalls;
  CallBases(Bool OUTPUT_SNPS, Bool NEED_RC, unsigned int MIN_READS, double MIN_RATIO,
	    Bool SUMMARY, unsigned int SUMMARY_WINDOW) :
    OUTPUT_SNPS(OUTPUT_SNPS), NEED_RC(NEED_RC), MIN_READS(MIN_READS), MIN_RATIO(MIN_RATIO),
    SUMMARY(SUMMARY), SUMMARY_WINDOW(SUMMARY_WINDOW),
    contig(0), nCalled(0), nRef(0)
  { 
    nSNP[0] = nSNP[1] = nIns[0] = nIns[1] = nDel[0] = nDel[1] = nBases = 0; 
  }

  void setContig(int c, int nbases)
  {
    contig = c;
    if (SUMMARY && c>=contigCallable.isize()) {
      contigSize.resize(c+1, 0);
      contigSize[c] = nbases;
      contigCallable.resize(c+1, 0);
      contigCalls.resize(c+1, 0);
      if (SUMMARY_WINDOW>0) {
	windowSize.resize(c+1);
	windowCallable.resize(c+1);
	windowCalls.resize(c+1);
      }
    }
    if (SUMMARY && SUMMARY_WINDOW>0) {
      const int nw = (nbases + SUMMARY_WINDOW - 1) / SUMMARY_WINDOW;
      windowCallable[c].resize(nw,0);
      windowCalls[c].resize(nw,0);
      windowSize[c].resize(nw,0);
      int lower, upper;
      for (int j=0; j<nw; ++j) {
	lower = j*SUMMARY_WINDOW;
	upper = min(nbases, (1+j)*int(SUMMARY_WINDOW));
	windowSize[c][j] = upper-lower;
      }
    }
  }

  void outSNP(const BaseMapInfo &x, Bool insert, char newBase, double percentcalled,
	      bool inTruth = false)
  {
    if (OUTPUT_SNPS)
      cout << contig << " " << x.pos << " " 
	   << (insert ? '-' : as_base(x.ref)) << " " << newBase
	   << "\t" << setprecision(3) << setw(4) << percentcalled << "%\t" 
	   << x << "\t" << (inTruth ? 'T' : 'F') << "\n";
  }

  void operator()(const BaseMapInfo &x, bool inTruth = false, unsigned char newBase = 0,
		  vecbitvector * refTrue = 0)
  {
    ++nBases;
    bool called = false, snpcalled = false, notrefcalled = false;
    for (int i=0; i<4; ++i) {
      if (x.Called(i, NEED_RC, MIN_READS, MIN_RATIO) ) {
	called = true;
	if (x.ref==i) {
	  ++nRef;
	  if (refTrue) ((*refTrue)[contig].Set(x.pos,1));
	} else {
	  if (!snpcalled) 
	    ++nSNP[inTruth]; // only count one SNP call per position
	  snpcalled = notrefcalled = true;
	  outSNP(x, false, as_base(i), x.PercentCalled(i), inTruth && i==newBase );
	}
      }
      if (x.InsCalled(i, NEED_RC, MIN_READS, MIN_RATIO) ) {
	called = notrefcalled = true;
	++nIns[inTruth];
	outSNP(x, true, as_base(i), x.PercentInsCalled(i), false);
      }
    }
    if (x.DelCalled(NEED_RC, MIN_READS, MIN_RATIO) ) {
      called = notrefcalled = true;
      ++nDel[inTruth];
      outSNP(x, false, '-', x.PercentDelCalled(), inTruth && 4==newBase);
    }
    if (called) {
      ++nCalled;
      if (SUMMARY) {
	++contigCallable[contig];
	if (SUMMARY_WINDOW) {
	  ++windowCallable[contig][x.pos/SUMMARY_WINDOW];
	}
      }
    }
    if (snpcalled && SUMMARY) {
      ++contigCalls[contig];
      if (SUMMARY_WINDOW) {
	++windowCalls[contig][x.pos/SUMMARY_WINDOW];
      }
    }
    if (notrefcalled && refTrue) (*refTrue)[contig].Set(x.pos,0);
  }
};

void WriteSummary(ostream &out, const CallBases &x)
{
  const double window_for_rate = (x.SUMMARY_WINDOW >0 ? x.SUMMARY_WINDOW : 1000);
  out << "SUMMARY\tContig\t#Bases\t#Callable\tCall rate\t#SNPs\tSNP rate per " << window_for_rate << " bases called\n";
  for (int i=0; i<x.contigCallable.isize(); ++i) {
    out << "SUMMARY\t" << i << "\t" << x.contigSize[i] << "\t"
	<< x.contigCallable[i] << "\t";
    if (0==x.contigSize[i])
      out << 0;
    else
      out << setprecision(2) << setw(6) << double(x.contigCallable[i])/x.contigSize[i];
    out << "\t" << x.contigCalls[i] << "\t"; 
    if (0==x.contigCalls[i]) {
      out << "0";
    } else {
      out << setprecision(2) << setw(6)
	  << ((window_for_rate * x.contigCalls[i]) / x.contigCallable[i]);
    }
    out << "\n";
  }

  if (x.SUMMARY_WINDOW>0) {
    out << "WINDOW\tWindow\tWindow size\tCall rate per " << x.SUMMARY_WINDOW
	<< "bases\tSNP rate per " << x.SUMMARY_WINDOW << " bases called\n";
    for (int i=0; i<x.contigCallable.isize(); ++i) {
      const int nw = (x.windowSize[i].size());
      if (nw <= 2) { // More or less repeat above info
	out << "WINDOW\t" << i << ".0\t" << x.contigSize[i] << "\t";
	if (0==x.contigSize[i])
	  out << 0;
	else
	  out << setprecision(4) << setw(8)
	      << (double(x.SUMMARY_WINDOW)*x.contigCallable[i])/x.contigSize[i];
	out << "\t";
	if (0==x.contigCalls[i]) {
	  out << "0";
	} else {
	  out << setprecision(4) << setw(8)
	      << (double(x.SUMMARY_WINDOW) * x.contigCalls[i] / x.contigCallable[i]);
	}
	out << "\n";
      } else { // Actually need to window the SNPs
	for (int j=0; j<nw; ++j) {
	  out << "WINDOW\t" << setprecision(5) << setw(6)
	      << (double(i) + double(j*x.SUMMARY_WINDOW)/x.contigSize[i])
	      << "\t" << x.windowSize[i][j] << "\t"
	      << setprecision(4) << setw(8) 
	      << (double(x.SUMMARY_WINDOW)*x.windowCallable[i][j])/x.windowSize[i][j] << "\t";
	  if (0==x.windowCallable[i][j])
	    out << 0;
	  else 
	    out << setprecision(4) << setw(8)
		<< ((double(x.SUMMARY_WINDOW) * x.windowCalls[i][j]) / x.windowCallable[i][j]);
	  out << "\n";
	}
      }
    }
  }
}


/// Print the overall counters of \a x on a single line (no trailing newline).
/// Each bracketed pair is [not-in-truth count, in-truth count].
ostream & operator<<(ostream &out, const CallBases &x)
{
  out << "nCalled = " << x.nCalled;
  out << ", nRef = " << x.nRef;
  out << ", nSNP = [" << x.nSNP[0] << "," << x.nSNP[1];
  out << "], nIns = [" << x.nIns[0] << "," << x.nIns[1];
  out << "], nDel = [" << x.nDel[0] << "," << x.nDel[1] << "]";
  return out;
}

/// False-positive rate figures shared by both WriteSensSpecReport() overloads.
/// All values are computed in the constructor, i.e. before any output is
/// written by the callers.
/// NOTE(review): when accept is 0 the raw rate is 0/0 (NaN); how
/// EstProbability / BinomialConfidenceInterval behave in that case is not
/// visible from this file.
struct SensSpecRates {
  double rate;     ///< wrong / accept
  double estrate;  ///< EstProbability(wrong, accept)
  double minrate;  ///< first element of BinomialConfidenceInterval(wrong, accept)
  double maxrate;  ///< second element of BinomialConfidenceInterval(wrong, accept)

  SensSpecRates(int wrong, int accept)
    : rate(double(wrong)/double(accept)),
      estrate(EstProbability(wrong, accept)),
      minrate(0), maxrate(0)
  {
    const pair<double,double> rateint = BinomialConfidenceInterval(wrong, accept);
    minrate = rateint.first;
    maxrate = rateint.second;
  }
};

/// Write the "fp/Mb" and "Q" columns that end every sens/spec report line,
/// followed by endl.  Rates are scaled to per-megabase; Q is -10*log10(rate).
static void WriteSensSpecRates(ostream &out, const SensSpecRates &r)
{
  out << "\tfp/Mb\t" << (1.0e6*r.rate) << "\t"
      << (1.0e6 * r.minrate) << "\t" << (1.0e6 * r.estrate) << "\t" << (1.0e6 * r.maxrate)
      << "\tQ\t" << -10.0 * log10(r.maxrate) << "\t" << -10.0 * log10(r.estrate)
      << "\t" << -10.0 * log10(r.minrate) << endl;
}

/// Report without a truth list: every SNP counted in x.nSNP[0] is treated as
/// wrong, and "sens" is the fraction of examined positions (x.nBases) at which
/// something was called (x.nCalled).
void WriteSensSpecReport(ostream &out, const CallBases &x)
{
  int wrong = x.nSNP[0], accept = x.nCalled;
  const SensSpecRates rates(wrong, accept);

  out << "nBases\t" << x.nBases << "\tnRef\t" << x.nRef
      << "\tnSNP\t" << x.nSNP[0] << "\tsens\t"
      << PERCENT_RATIO(3, accept, x.nBases);
  WriteSensSpecRates(out, rates);
}

/// Report against a truth list: SNPs at truth positions (x.nSNP[1]) are
/// "right", the others (x.nSNP[0]) are "wrong", and "sens" is right relative
/// to the number of entries in \a truth.  \a truth is only read for its size.
void WriteSensSpecReport(ostream &out, const CallBases &x, vec<PolyCall> &truth)
{
  int right = x.nSNP[1], ntrue = truth.isize();
  int wrong = x.nSNP[0], accept = x.nCalled;
  const SensSpecRates rates(wrong, accept);

  out << "ntrue\t" << ntrue << "\tright\t" << right
      << "\twrong\t" << wrong << "\tsens\t"
      << PERCENT_RATIO(3, right, ntrue);
  WriteSensSpecRates(out, rates);
}

/// Placeholder, not implemented yet ("fill this in later"): the body is empty,
/// so both \a first and \a second are left untouched.  Presumably intended to
/// remove from \a first whatever is covered by \a second -- confirm before
/// implementing.
void RemoveSecondFromFirst(vec<HoIntervalWithId> & first, 
			   const vec<HoIntervalWithId> & second) {}

int main( int argc, char *argv[] )
{
  RunTime();

  BeginCommandArguments;
  CommandDoc(DOC);
  /// Filename arguments
  CommandArgument_String_Doc(IN,"Input coverage_map file."); 

  /// Calling parameters 

  CommandArgument_Bool_OrDefault_Doc(NEED_RC, True,"Whether to require bidirectional reads for SNP calls.");
  CommandArgument_UnsignedInt_OrDefault_Doc(MIN_READS, 2,"Number of reads to require for SNP calls.");
  // If improbable ratio of this allele to all alleles observed, reject call
  CommandArgument_Double_OrDefault_Doc(MIN_RATIO, 0,"Control SNP calls by setting the minimum ratio of # accepted reads for this allele to # accepted reads for all alleles.  E.g. 0.66 for haploid data and 0.1 for diploid data worked well for 454 reads.");

  /// Where to call SNPs.
  // The lists are both formatted as vectors of HoIntervalWithId's.
  CommandArgument_String_OrDefault_Doc(CALL_ONLY,"","If given, specifies a file containing the only regions in which this program should call SNPs, in 3-column format: contig start_base one_past_stop_base."); 
  CommandArgument_String_OrDefault_Doc(DO_NOT_CALL,"","If given, specifies a file containing regions in which this program should not call SNPs, in 3-column format as for CALL_ONLY.  Where the two lists overlap, this one wins."); 


  /// Output arguments
  CommandArgument_Bool_OrDefault_Doc(OUTPUT_SNPS, True,"Whether to write to stdout the SNPs called.");
  CommandArgument_Bool_OrDefault_Doc(OUTPUT_SENS_SPEC, False,"Whether to write to stdout a report of sensitivity and specificity from the SNPs and reference bases called, assuming that the reference is perfect (or else has true SNPs in TRUTH, if given).");
  CommandArgument_Bool_OrDefault_Doc(SUMMARY, True,"Whether to write to stdout a summary of SNP rate by contig.");
  CommandArgument_UnsignedInt_OrDefault_Doc(SUMMARY_WINDOW, 0,"If positive, write to stdout a summary of SNP rate in windows of this size.");
  CommandArgument_String_OrDefault_Doc(OUTPUT_REF, "","If given, contains a filename.  Write vecbitvector reflecting whether reference was called.");
  CommandArgument_String_OrDefault_Doc(REF, "","Needed if OUTPUT_REF is true.  Filename of reference, fasta or fastb.");

  // TRUTH file, if given, is formatted as list of PolyCall's.  
  CommandArgument_String_OrDefault_Doc(TRUTH,"","If given, specifies a file containing true SNPs for this data set, in 4-columnn format (contig position ref_base true_base).  Mark called SNPs as T or F.  If OUTPUT_SENS_SPEC is set, check this list to evaluate specificity."); 
  CommandArgument_Bool_OrDefault_Doc(TRUTH_ONLY,False,"If set, call SNPs only at positions in TRUTH set."); 

  EndCommandArguments;

  vecbitvector * reftrue = 0;
  if (!OUTPUT_REF.empty()) {
    reftrue = new vecbitvector();
    vecbasevector ref;
    ForceAssert(!REF.empty());
    if (REF.Contains("fastb", -1)) ref.ReadAll(REF);
    else FastFetchReads(ref, 0, REF);
    for (int i=0; i !=ref.size(); ++i) {
      reftrue->push_back_reserve(bitvector(ref[i].size(),0));
    }
  }

  vec<HoIntervalWithId> callOnly;
  if (!CALL_ONLY.empty()) {
    Ifstream(is, CALL_ONLY);
    callOnly.ReadFromTextStream(is);
    sort(callOnly.begin(), callOnly.end(), LessById);
  }

  vec<HoIntervalWithId> dontCall;
  if (!DO_NOT_CALL.empty()) {
    Ifstream(is, DO_NOT_CALL);
    dontCall.ReadFromTextStream(is);
    sort(dontCall.begin(), dontCall.end(), LessById);
    FatalErr("DO_NOT_CALL is not currently implemented.");
  }

  vec<PolyCall> truth;
  if (!TRUTH.empty()) {
    //if (!TRUTH_ONLY) FatalErr("only implemented with TRUTH_ONLY");
    Ifstream(is, TRUTH);
    truth.ReadFromTextStream(is);
  }
  sort(truth.begin(), truth.end());

  RemoveSecondFromFirst(callOnly, dontCall);
  vec<HoIntervalWithId>::iterator callIter = callOnly.begin();
  vec<PolyCall>::iterator truthIter = truth.begin();
  
  CallBases call(OUTPUT_SNPS, NEED_RC, MIN_READS, MIN_RATIO, SUMMARY, SUMMARY_WINDOW);
  // Read the base counts for each line and call accordingly
  bool allDone = false;
  CoverageMapIterator in(IN);
  for ( ; !done(in); ++in) {
    if (allDone) break;
    call.setContig(contig(in), nBases(in));
    for (ContigCoverageIterator it = contigIt(in); !done(it); ++it) {
      int pos = it->pos;
      int cont = contig(in);
      bool inTruth = false;

      if (truthIter != truth.end()) {
	while (truthIter != truth.end() 
	       && (truthIter->refID < cont ||
		   truthIter->refID == cont && truthIter->posOnRef < pos)) {
	  ++truthIter;      
	}
	if (truthIter == truth.end()) {
	  if (TRUTH_ONLY) { allDone = true; break; }
	}
	else {
	  inTruth = (truthIter->refID == cont && truthIter->posOnRef == pos);
	  if (TRUTH_ONLY && !inTruth) continue;
	}
      }

      if (!callOnly.empty()) {
	while (callIter != callOnly.end() && 
	       ( callIter->id < cont ||
		 callIter->id == cont && callIter->Stop() <= pos)) {
	  ++callIter;
	}
	if (callIter == callOnly.end()) { allDone = true; break; }
	if (callIter->id!=cont || !Member(*callIter, pos)) continue;
      }

      call(*it, inTruth, (inTruth ? truthIter->newBase : 0), reftrue);
    }
  }

  // inform call of where we stopped in case there were no calls in last contig
  call.setContig(contig(in), nBases(in)); 

  // Output short summary
  cout << call << endl;

  if (SUMMARY)
    WriteSummary(cout, call);
  
  if (OUTPUT_SENS_SPEC) {
    if (TRUTH.empty())
      WriteSensSpecReport(cout, call);
    else
      WriteSensSpecReport(cout, call, truth);
  }

  if (reftrue) reftrue->WriteAll(OUTPUT_REF);

  return 0;
}


