/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2006) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

#include "Basevector.h"
#include "Qualvector.h"
#include "util/QualInfo.h"
#include "lookup/LookAlign.h"
#include "lookup/LookAlignSort.h"
#include "lookup/VecFromLookAlign.h"
#include "math/Functions.h"

// Writes a one-line summary to os: the total number of bases (the sum of
// count_), the insertion, deletion and mutation tallies, and an overall
// quality computed as -10*log10(errors/bases), where errors is the sum of the
// three tallies.  The line is terminated with endl, so the stream is flushed.
// There is no guard for an empty histogram (bases == 0).
void QualInfo::PrintErrors( ostream & os)
{
  const longlong bases = Sum(count_);
  const double errorRate = double(i_ + d_ + m_) / bases;

  os << "bases " << bases;
  os << ", insertions " << i_;
  os << ", deletions " << d_;
  os << ", mutations " << m_;
  os << ", quality " << -10*log10(errorRate);
  os << endl;
}


// R-squared of the observed quality of each bin against the ideal line y=x.
//
// The key difference between this and the standard r-squared calculation
// is that we define the expected value not from the regression line
// of the data, but from the ideal y=x regression line when we calculate
// the error sum of squares sse; the total sum of squares sst is taken
// around the mean of the observed values as usual.
//
// start, end: half-open range [start, end) of bins to use.  A negative end
//             selects every bin up to count_.size().
// Bins whose count_ entry is zero carry no data and are skipped.
// Returns 1 - sse/sst.  The divisions are unguarded, so the result is not
// finite when no bin in the range has data or when sst is zero.
float QualInfo::R2ideal(int start, int end) {
  // Resolve the "negative end" convention before validating the range.
  // Checking start <= end first would reject every call that relies on it,
  // because start is required to be non-negative.
  if (end < 0) end = count_.size();
  AssertGe(start, 0);
  AssertLe(start, end);

  const int BAD=-1;
  vec<float> observed(end, BAD);//mark all bins as having no data
  float mean = 0;
  int goodObserved = 0;
  for (int i=start; i != end; ++i) {
    if (count_[i]) {
      observed[i] = QualScoreFloat(count_[i] - errors_[i], errors_[i]);
      mean += observed[i];
      ++goodObserved;
    }
  }
  mean /= goodObserved;

  float sse = 0, sst = 0;
  for (int i=start; i != end; ++i) {
    if (BAD != observed[i]) { //do not use bins with no data
      sse += Square(observed[i] - i);    // deviation from the ideal value i
      sst += Square(observed[i] - mean); // deviation from the observed mean
    }
  }
  return 1 - sse/sst;
}
    
float QualInfo::ActualQGreater(int q) {
  AssertGe(q, 0);

  longlong acc=0;
  longlong total = accumulate(count_.begin(), count_.end(), acc);

  int good = 0;
  for (int i=0; i != count_.isize(); ++i) {
    if (count_[i]) {
      float thisq = QualScoreFloat(count_[i] - errors_[i], errors_[i]);
      if (thisq < q) good = 0;
      else good += count_[i];
    }
  }
  return double(good) / total;
}
    
        
// Writes a tab-separated table to out with the header
// "predictedQ proportionOfTotal actualQ" followed by one row per bin index
// from lowest through highest inclusive.  Each row holds the bin index, the
// share of all bases (the sum of count_) attributed to the row, and the
// value of QualScoreFloat(count - errors, errors) for the row.
//   - the row for lowest also absorbs every bin below lowest;
//   - the row for highest also absorbs every bin above highest;
//   - rows strictly in between cover exactly one bin.
// Requires 0 <= lowest <= highest < count_.isize().  There is no guard for
// an empty histogram (the proportions divide by the total).
void QualInfo::DataForGraph(ostream & out, int lowest, int highest) {
  AssertGe(lowest, 0);
  AssertLt(highest, count_.isize());
  // An inverted range would make the per-bin loop below start beyond its end.
  AssertGe(highest, lowest);

  out << "predictedQ\tproportionOfTotal\tactualQ\n";

  longlong acc=0;
  const longlong total = accumulate(count_.begin(), count_.end(), acc);

  longlong totcount=0, toterrors=0;

  // Everything below lowest is folded into the first row.
  for (int i=0; i < lowest; ++i) {
    totcount += count_[i];
    toterrors += errors_[i];
  }

  // One row per bin in [lowest, highest).
  for (int i=lowest; i < highest; ++i) {
    totcount += count_[i];
    toterrors += errors_[i];
    out << i << "\t"
        << double(totcount)/total << "\t"
        << QualScoreFloat(totcount - toterrors, toterrors) << "\n";
    totcount = toterrors = 0;
  }

  // highest and everything above it are folded into the last row.
  for (int i=highest; i < count_.isize(); ++i) {
    totcount += count_[i];
    toterrors += errors_[i];
  }
  out << highest << "\t"
      << double(totcount)/total << "\t"
      << QualScoreFloat(totcount - toterrors, toterrors) << "\n";
}

// Groups the per-score tallies in count_ / errors_ into bins of binsize
// consecutive scores and prints one line per bin via PrintBin, together with
// the cumulative figures computed by CumulativePercent.
//   - scores below lowest are folded into the first bin that is emitted;
//   - a bin is emitted each time a score in [lowest, highest) completes a
//     group of binsize scores (i % binsize == binsize - 1);
//   - whatever remains of the bin containing highest, plus every score from
//     highest upward, goes into one final bin whose end label is
//     count_.isize().
// Requires binsize >= 1 and 0 <= lowest <= highest < count_.isize().
void QualInfo::PrintBins(ostream & out, int binsize,
			 int lowest, int highest) {
  AssertGe(binsize, 1);   // binsize is used as a modulus below
  AssertGe(lowest, 0);
  AssertGe(highest, lowest);
  AssertLt(highest, count_.isize());

  const int nscores = count_.isize();
  longlong totcount=0, toterrors=0;

  vec<Bin> bins;

  for (int i=0; i < lowest; ++i) {
    totcount += count_[i];
    toterrors += errors_[i];
  }
  // i stays below highest, which is itself below nscores, so the last score
  // is never reached here and needs no special case.
  for (int i=lowest; i < highest; ++i) {
    totcount += count_[i];
    toterrors += errors_[i];
    if (i % binsize == binsize - 1) {
      bins.push_back(Bin(i - (i % binsize), i, totcount, toterrors));
      totcount = toterrors = 0;
    }
  }
  for (int i=highest; i < nscores; ++i) {
    totcount += count_[i];
    toterrors += errors_[i];
  }
  // Always emit the tail bin.  It used to be skipped when highest was the
  // last score, which dropped the tally at highest (and any partial bin
  // before it) from the output and from the cumulative totals.
  bins.push_back(Bin(highest - (highest % binsize), nscores,
		     totcount, toterrors));

  vec<double> cPercent, cQual;
  CumulativePercent(bins, cPercent, cQual);
  for (int i=0; i != bins.isize(); ++i) {
    Bin & b = bins[i];
    PrintBin(out, b.start, b.end, b.count, b.errs, cPercent[i], cQual[i]);
  }
}

void QualInfo::CumulativePercent(const vec<Bin> & bins, vec<double> &percent, vec<double> &qual) {
  percent.resize(bins.size(), 0);
  qual.resize(bins.size(), 0);
  longlong totalcount=0, currcount, currerrs=0;
  for (int i=0; i != bins.isize(); ++i) {
    totalcount += bins[i].count;
    currerrs += bins[i].errs;
  }
  currcount=totalcount;
  //PRINT(totalcount);
  for (int i=0; i != bins.isize(); ++i) {
    qual[i] = -10*log10(EstProbability(currerrs, currcount));
    percent[i] = (100.0*currcount)/totalcount;
    currcount -= bins[i].count;
    currerrs -= bins[i].errs;
  }
}
      
// Prints one tab-separated line describing a bin:
//   start-end, base count, error count, error percentage,
//   -10*log10 of EstProbability(toterrors, totcount),
//   the same transform applied to the two ends of the interval returned by
//   BinomialConfidenceInterval(toterrors, totcount, 0.05) (second member
//   first), cumulativeQual, cumulativePercent.
// Everything up to cumulativeQual is written with stream precision 2 and
// cumulativePercent with precision 3; the stream is left at precision 3.
void QualInfo::PrintBin(ostream & out, int start, int end, longlong totcount,
			longlong toterrors, double cumulativePercent, double cumulativeQual) {
  const double proberr = EstProbability(toterrors, totcount);
  const pair<double,double> errCI = BinomialConfidenceInterval(toterrors, totcount, 0.05);

  // Derived columns, computed up front so the output statements stay flat.
  const double errPercent = toterrors*100.0/totcount;
  const double binQual = -10*log(proberr)/log(10);
  const double qualFromUpper = -10*log(errCI.second)/log(10);
  const double qualFromLower = -10*log(errCI.first)/log(10);

  out.precision(2);
  out << start << "-" << end;
  out << "\t" << totcount;
  out << "\t" << toterrors;
  out << "\t" << errPercent;
  out << "\t" << binQual;
  out << "\t" << qualFromUpper << "-" << qualFromLower;
  out << "\t" << cumulativeQual;

  out.precision(3);
  out << "\t" << cumulativePercent << "\n";
}

// Unless keep is set, rewrites every entry of readIncorr that equals errType
// to AlignErr::MATCH, so that kind of error is no longer visible to later
// processing.  With keep set, readIncorr is left untouched.
template<class V>
void CleanReadIncorr(V & readIncorr, bool keep, const int errType) {
  if (!keep) {
    const int n = readIncorr.isize();
    for (int pos = 0; pos < n; ++pos) {
      if (readIncorr[pos] == errType) {
        readIncorr[pos] = AlignErr::MATCH;
      }
    }
  }
}
  
// Folds one read-to-reference alignment into the running statistics.
//
// la:           the alignment; its query_length / target_length must equal
//               the sizes of read and ref, otherwise the call is fatal.
// read, q:      the read's bases and its quality scores.
// ref:          the reference sequence the read was aligned to.
// etype:        bit mask of QualInfo::MIS / DEL / INS selecting which kinds
//               of error are counted; kinds not selected are treated as
//               matches.
// EXCLUDE_ENDS: passed through to RemoveErrorsAtEnds.
//
// For every position that is not AlignErr::NO_BASE, AddInfo is called with
// the quality score at that position and whether the position is a match;
// mismatches, insertions and deletions additionally bump their own tallies.
void QualInfo::AddAlignment(const look_align_plus & la, 
			    const basevector & read,
			    const qualvector & q,
			    const basevector & ref,
			    errType etype,
			    int EXCLUDE_ENDS) {
  if (read.size() != la.query_length || ref.size() !=  la.target_length) {
    PRINT2(la.query_id, la.target_id);
    PRINT4(read.size(), la.query_length, ref.size(), la.target_length);
    FatalErr("bad match, can't continue\n");
  }

  // readIncorr ends up holding one AlignErr code per position (see the
  // switch below); delCount is only handed on to SetDeletionsWithQuals.
  vec<unsigned char> readIncorr;
  vec<int> delCount;
  VecFromLookAlign(readIncorr, delCount, read, ref, la);
  SetDeletionsWithQuals(readIncorr, delCount, q);
  RemoveErrorsAtEnds(readIncorr,EXCLUDE_ENDS);

  // Turn error kinds the caller did not ask for back into matches.
  CleanReadIncorr(readIncorr, etype & QualInfo::MIS, AlignErr::MISMATCH);
  CleanReadIncorr(readIncorr, etype & QualInfo::DEL, AlignErr::DELETION);
  CleanReadIncorr(readIncorr, etype & QualInfo::INS, AlignErr::INSERTION);
  
  // NOTE(review): a length mismatch between q and read is only reported;
  // processing continues and q is still indexed by position below. Confirm
  // that this best-effort behaviour is intended.
  if (q.size() != int(read.size())) {
    PRINT4("size mismatch!",la.query_id,q.size(), read.size());
  }

  for ( int b=0; b != readIncorr.isize(); ++b) {
    switch (readIncorr[b]) {
    case AlignErr::NO_BASE: 
      break;
    case AlignErr::MATCH: 
      AddInfo(q[b],true);
      break;
    case AlignErr::MISMATCH: 
      AddMutation();
      AddInfo(q[b],false);
      break;
    case AlignErr::INSERTION: 
      AddInsertion();
      AddInfo(q[b],false);
      break;
    case AlignErr::DELETION: 
      AddDeletion();
      AddInfo(q[b],false);
      break;
    default:
      // A space now separates the printed value from the rest of the message.
      FatalErr("Unknown value " << int(readIncorr[b])
		 << " in readIncorr at base " << b);
      break;
    }
  }
}




