/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2006) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

/// \file SffInfo.cc This class contains and gives access to a table
/// of information about flows.  See SffInfo.h.

#include "454/sff/SffInfo.h"
#include "454/sff/SffRead.h"
#include "CoreTools.h"
#include "454/flowdata/AnalysisInfo.h"
#include "ParseSet.h"
#include "454/sff/SffTypes.h"
#include "454/qual/PhredTableReader.h"
#include "Pair.h"


#include <cmath>
#include <sstream>
#include <algorithm>

#include "VecTemplate.h"
// Expand BINARY3_DEF for the two 16-bit element types this file stores.
// NOTE(review): presumably this instantiates the binary read/write support
// for vectors of these types declared in VecTemplate.h -- confirm there.
BINARY3_DEF(unsigned short);
BINARY3_DEF(short);

// Lets names from namespace sff be used unqualified in the rest of this file.
using namespace sff;

/// Reserved column / property names. They are defined once, here, so that
/// every user of SffInfo refers to the same spelling.
const String SffInfo::NAME            = "name";
const String SffInfo::BASES           = "bases";
const String SffInfo::QUALS           = "quals";
const String SffInfo::FLOWS           = "flows";
const String SffInfo::CORR            = "correspondence";
const String SffInfo::CLIP_QUAL_LEFT  = "clip_qual_left";
const String SffInfo::CLIP_QUAL_RIGHT = "clip_qual_right";
const String SffInfo::CLIP_ADAP_LEFT  = "clip_adap_left";
const String SffInfo::CLIP_ADAP_RIGHT = "clip_adap_right";
const String SffInfo::FLOW_ORDER      = "flow_order";
const String SffInfo::SIZE            = "size";

/// Load a SffInfo from disk.
///
/// Reads one file per typed property table (filename + "." + table name, as
/// enumerated by 454/sff/SffInfoTypeTable.h) and then the string-property map
/// from filename + ".Info".
///
/// Layout of the ".Info" file, as consumed here:
///   int       total byte count of the records that follow
///   records:  longlong length, property-name bytes,
///             longlong length, value bytes            (repeated)
///
/// If the byte count is 0 the function returns right after the typed tables
/// have been read, without touching the string map or the size.  Otherwise
/// the SIZE property must be present and is used to call setSize().
///
/// \param filename  common prefix of the files to read.
void SffInfo::Read( const String &filename)  {
  // Read in each type of property vector, using a macro and the type table.
#define SFF_IMPL(Name, MemberType, VecType) \
  Name ## map_.Read(filename + "." #Name)
#include "454/sff/SffInfoTypeTable.h"
#undef SFF_IMPL

  //Read the string map.
  int stringsSize = 0;
  int fid = OpenForRead(filename+".Info");
  ForceAssert(sizeof(int)==read(fid, &stringsSize, sizeof(int)));
  if (0 == stringsSize) {
    Close(fid); //do not leak the descriptor on the early exit.
    return;     //all done, no data.
  }
  // A negative count can only come from a corrupt file.
  ForceAssertGt(stringsSize, 0);

  //Do not use String here! String::operator std::string() does not
  // work right in the sense that it does not preserve any data after
  // the first '\0'.
  string s(stringsSize,' ');
  ForceAssertEq(int(stringsSize *sizeof(char)),
                read(fid, &s[0], stringsSize * sizeof(char)));
  // Everything is in memory now; the descriptor is no longer needed.
  Close(fid);

  istringstream iss(s);
  String property, value;
  longlong size;
  // Use the read of the leading length as the loop condition: testing the
  // stream state *before* reading would run one extra pass after the last
  // record and store a garbage entry built from stale buffers.
  while (iss.read((char *) &size, sizeof(longlong))) {
    property.resize(size);
    iss.read(&property[0], size);

    iss.read((char *) &size, sizeof(longlong));
    value.resize(size);
    iss.read(&value[0], size);

    // A record that stops short means the data is truncated; fail loudly
    // like the other I/O checks in this function rather than storing it.
    ForceAssert(!iss.fail());

    strings_[property] = value;
  }

  ForceAssert(strings_.find(SIZE)!=strings_.end());
  setSize(strings_[SIZE].Int());
}

/// Save to disk: one file per typed property table (filename + "." + table
/// name) plus the string map in filename + ".Info".
///
/// The ".Info" file holds an int byte count followed by, for every string
/// property, a longlong length and the bytes of the name, then a longlong
/// length and the bytes of the value.
///
/// \param filename  common prefix of the files to write.
void SffInfo::Write(const String &filename) const
{
  // Emit each typed property vector; the type table drives the macro.
#define SFF_IMPL(Name, MemberType, VecType) \
  Name ## map_.Write(filename + "." #Name)
#include "454/sff/SffInfoTypeTable.h"
#undef SFF_IMPL

  // The string map goes last.
  int fid = OpenForWrite(filename+".Info");

  // Build all the length-prefixed records in memory first, because the file
  // starts with their total size.
  ostringstream packed;
  Smap::const_iterator entry = strings_.begin();
  while (entry != strings_.end()) {
    longlong length = entry->first.size();
    packed.write((char *) &length, sizeof(longlong));
    packed.write(entry->first.c_str(), entry->first.size());

    length = entry->second.size();
    packed.write((char *) &length, sizeof(longlong));
    packed.write(entry->second.c_str(), entry->second.size());

    ++entry;
  }

  // Take one snapshot of the buffer and use it for both the size and the data.
  const string payload = packed.str();
  int stringsSize = payload.size();
  ForceAssert(sizeof(int) == write(fid, &stringsSize, sizeof(int)));
  ForceAssert(int(stringsSize *sizeof(char)) ==
              write(fid, payload.c_str(), stringsSize * sizeof(char)));
  Close(fid);
}

/// Return the sizes of the sequences in the reference associated with key.
///
/// The answer is cached in info under the string property "referenceSizes".
/// When that property is missing, the reference file name is looked up
/// through the AnalysisInfo stored in the "analinfo" property (which must
/// exist), ".lookuptable.fastb" is appended if the name does not contain
/// ".fastb", the file is loaded, and the sizes are stored as "{a,b,...,}".
///
/// \param info  the SffInfo to query; logically const, but the cache
///              property may be added to it.
/// \param key   passed to AnalysisInfo::GetReference.
/// \return the parsed sizes, or an empty vector if there is no reference.
vec<int> GetReferenceSizes(const SffInfo & info, const String & key) {
  vec<int> sizes;

  const bool alreadyCached = info.HasStringProperty("referenceSizes");
  if (!alreadyCached) {
    ForceAssert(info.HasStringProperty("analinfo"));
    AnalysisInfo analysis(0, info.GetString("analinfo"));
    String refFile = analysis.GetReference(key);

    // No reference at all: hand back the empty vector and cache nothing.
    if (refFile.empty()) {
      return sizes;
    }
    if (!refFile.Contains(".fastb")) {
      refFile += ".lookuptable.fastb";
    }
    RequireRegularFile(refFile);
    vecbasevector refSeqs(refFile);

    // Encode the sizes as a set literal that ParseIntSet reads back below.
    String encoded = "{";
    for (int n = 0; n != refSeqs.size(); ++n) {
      encoded += ToString(refSeqs[n].size());
      encoded += ",";
    }
    encoded += "}";

    // Storing the cache needs a mutable object; casting away const is a
    // deliberate cheat because this is only memoization.
    const_cast<SffInfo &>(info).SetString("referenceSizes", encoded);
  }

  ParseIntSet(info.GetString("referenceSizes"), sizes, false);
  return sizes;
}

/// Fill this SffInfo, which must be empty, from the sff file fname.
///
/// The file is scanned twice: once with sff::CountBasesAndNames to learn the
/// total number of bases and name characters (used to size the storage), and
/// once to load the data.  For every read the flows, bases, quals, names,
/// the four clip values and the flow/base correspondence are stored under
/// the privileged column names, and the flow order is stored as the
/// FLOW_ORDER string property.
///
/// \param fname              sff file to read.
/// \param allBasesFromFlows  if true, AllBasesFromFlows() is run after
///                           loading (skipped when the file has no reads).
/// \return the common header read from the file.
sff::CommonHeader SffInfo::ReadFromSff(const String & fname, 
                                       bool allBasesFromFlows) {
  ForceAssert(empty());

  // Once through the file, rapidly, to determine how much memory we need to
  // allocate for the results.
  pair<int, int> counts;
  {
    Ifstream(is, fname);
    counts = sff::CountBasesAndNames(is);
  }
  // Now we read it for real. 
  Ifstream(is, fname);

  sff::CommonHeader chead;
  chead.Read(is);

  SetString(FLOW_ORDER, 
            string(chead.flow_chars.begin(), chead.flow_chars.end()));
  const int N = chead.number_of_reads;
  setSize(N);
  // Average number of bases per read, rounded up to nearest int: needed for
  // reserving space via GetNew...Vector.  A file with no reads must not
  // divide by zero, so it simply reserves nothing.
  const int avgBases = (N > 0) ? (counts.first+N-1) / N : 0;
  // Corresponding quantity for names
  const int avgName = (N > 0) ? (counts.second+N-1) / N : 0;

  //Get and initialize the mastervecs.  Initializer defines the
  //average amount of space per read.
  ShortvecVector & flows = GetNewShortvecVector(FLOWS,
    serfvec<unsigned short>(chead.number_of_flows_per_read));
  BasevecVector & bases = GetNewBasevecVector(BASES, basevector(avgBases));
  QualvecVector & quals = GetNewQualvecVector(QUALS, qualvector(avgBases));
  ShortvecVector & corr = GetNewShortvecVector(CORR, vecushort(avgBases));
  StringVector & names = GetNewStringVector(NAME, String(avgName));
  ShortVector & clipQualLeft = GetNewShortVector(CLIP_QUAL_LEFT, 0);
  ShortVector & clipQualRight = GetNewShortVector(CLIP_QUAL_RIGHT, 0);
  ShortVector & clipAdapLeft = GetNewShortVector(CLIP_ADAP_LEFT, 0);
  ShortVector & clipAdapRight = GetNewShortVector(CLIP_ADAP_RIGHT, 0);

  ReadHeader header;
  ReadData datum;
  basevector b;
  for (int i=0; i != N; ++i) {
    ReadOne(is, header, datum, chead); 
    // Reset the clips. They come in as 1-based, closed intervals with
    // 0 meaning no clip, and come out as 0-based, half-open intervals
    // with NO_CLIP meaning no clip.
    // Per the original author's note NO_CLIP == -1, which makes the ternary
    // redundant for the left clips (0 - 1 == -1); it is spelled out because
    // the logic is clearer this way.  The right clips are not decremented,
    // so for them the ternary does real work.
    clipQualLeft[i] = (0 == header.clip_qual_left) ? NO_CLIP 
      : header.clip_qual_left -1;
    clipQualRight[i] = (0 == header.clip_qual_right) ? NO_CLIP 
      : header.clip_qual_right;
    clipAdapLeft[i] = (0 == header.clip_adapter_left) ? NO_CLIP 
      : header.clip_adapter_left - 1;
    clipAdapRight[i] = (0 == header.clip_adapter_right) ? NO_CLIP 
      : header.clip_adapter_right;
    b.SetFromStringWithNs(datum.bases);
    AssertLt(clipQualLeft[i], b.isize());
    AssertLe(clipQualRight[i], b.isize());
    AssertLt(clipAdapLeft[i], b.isize());
    AssertLe(clipAdapRight[i], b.isize());

    bases.push_back(b);
    quals.push_back(datum.quals);
    flows.push_back(datum.flows);

    // c[0] is written unconditionally below, so an empty correspondence
    // must be rejected first.
    ForceAssertGt(datum.correspondence.size(), 0);
    serfvec<unsigned short> c(datum.correspondence.size());
    c[0] = datum.correspondence[0] - 1;//make it 0-based, not 1-based.
    //transform correspondence from relative to absolute coordinates.
    //(k, not i: the read index i of the outer loop must stay visible.)
    for (int k=1; k < c.size(); ++k)
      c[k] = c[k-1] + datum.correspondence[k];
    corr.push_back(c);
    names.push_back_reserve(header.name);
  }
  PRINT2(names.rawsize(), counts.second);
  PRINT2(names.size(), size());
  ForceAssertEq(quals.rawsize(), counts.first);
  
  // With no reads there is nothing to recall, and AllBasesFromFlows looks at
  // the first read before it starts looping.
  if (allBasesFromFlows && N > 0) AllBasesFromFlows();

  return chead;
}

/// Convenience overload: write every read's bases to the file named s by
/// forwarding to WriteFasta(ostream &, bool).
/// \param s        name of the output file.
/// \param useClip  forwarded unchanged to the stream overload.
void SffInfo::WriteFasta(const String & s, bool useClip) {
  // Ofstream is a project macro; it evidently declares the stream `os` used
  // on the next line.  NOTE(review): presumably it also checks that the file
  // opened -- confirm in the macro's definition.
  Ofstream(os, s);
  WriteFasta(os, useClip);
}

/// Print the bases of every read to os, each labelled with the read's name
/// (via basevector::Print).
/// \param os       destination stream.
/// \param useClip  if false the complete base sequence is printed; if true
///                 only the sub-sequence [clips.first, clips.second) given
///                 by SffRead::Clips() is printed.
void SffInfo::WriteFasta(ostream & os, bool useClip) {
  // Scratch buffer for the clipped sequence, reused across reads.  No copy
  // of the bases is made when the whole read is printed.
  basevector clipped;
  for (SffRead r(*this); r.Index() != longlong(size()); ++r) {
    if (!useClip) {
      r.Bases().Print(os, r.Name());
    } else {
      pair<unsigned short, unsigned short> clips = r.Clips();
      clipped.SetToSubOf(r.Bases(), clips.first, clips.second - clips.first);
      clipped.Print(os, r.Name());
    }
  }
}

/// Convenience overload: write every read's quality scores to the file named
/// s by forwarding to WriteQuals(ostream &, bool).
/// \param s        name of the output file.
/// \param useClip  forwarded unchanged to the stream overload.
void SffInfo::WriteQuals(const String & s, bool useClip) {
  // Ofstream is a project macro; it evidently declares the stream `os` used
  // on the next line.  NOTE(review): presumably it also checks that the file
  // opened -- confirm in the macro's definition.
  Ofstream(os, s);
  WriteQuals(os, useClip);
}

/// Print the quality scores of every read to os, each labelled with the
/// read's name.
/// \param os       destination stream.
/// \param useClip  if true only the scores in [clips.first, clips.second),
///                 as given by SffRead::Clips(), are printed; otherwise the
///                 complete quality vector is printed.
void SffInfo::WriteQuals(ostream & os, bool useClip) {
  SffRead read(*this);
  while (read.Index() != longlong(size())) {
    if (useClip) {
      // Cut the clipped range out into a temporary and print that.
      qualvector trimmed;
      pair<unsigned short, unsigned short> range = read.Clips();
      trimmed.SetToSubOf(read.Quals(), range.first,
                         range.second - range.first);
      Print(os, trimmed, read.Name());
    } else {
      Print(os, read.Quals(), read.Name());
    }
    ++read;
  }
}

///Called only from AllBasesFromFlows.
/// Kludge: if there are three bases in a row with no calls, and one of 
/// them is sufficiently high and better than the others, raise it to 50
/// so it gets called, but with low quality.
/// Note: I also tried raising any of them that were above 40, but that 
/// worked a little bit worse, so we keep the criterion of only raising
/// one of them, and only if it is clearly higher than the others.
///
/// Concretely: slide a window of three consecutive flow values over the
/// read.  If the largest value in the window is in [40, 50) and the middle
/// value is below 40, the largest value is overwritten with 50.  The read's
/// flows are modified in place.
///
/// \param r  the read whose flows are adjusted.
void FixDots(SffRead & r) {
  serfvec<unsigned short> & flows = r.Flows();

  // With fewer than three flows there is no window to look at, and
  // flows.end() - 3 would lie before flows.begin(), so the loop below could
  // never reach its end condition.
  if (flows.size() < 3) return;

  // NOTE(review): this bound stops one window early, so the final window
  // [end-3, end) is never examined.  Kept as is because changing it would
  // alter base calls; confirm whether that is intended.
  const serfvec<unsigned short>::iterator stop = flows.end() - 3;
  for (serfvec<unsigned short>::iterator i = flows.begin(); i != stop; ++i) {
    serfvec<unsigned short>::iterator top = max_element(i, i+3);
    if (*top >= 50) continue; // something in this window is already called.

    unsigned short bottom = *min_element(i, i+3);
    // Sum of the three minus the largest and the smallest leaves the middle.
    unsigned short mid = accumulate(i, i+3, 0) - *top - bottom;
    if (*top >=40 && mid < 40) { //fix the dot: give top the value 50. 
      *top = 50; 
    }
  }
}

/// Recompute the bases of every read from its flow values.
///
/// For each read: FixDots() is applied to the flows, then flow j contributes
/// sff::Call(flow value) copies of the j-th character of the FLOW_ORDER
/// string, with j recorded as the correspondence of each such base.  The
/// clips are adjusted through SffRead::FixClips (which needs the old bases
/// and correspondence, so it runs first), then bases and correspondence are
/// replaced and the quals are resized to one entry per base, all set to 0.
///
/// The length of the FLOW_ORDER string must equal the number of flows of the
/// first read.  Does nothing when there are no reads.
///
/// \param MAX_HEIGHT  currently unused by this implementation.
/// \param MAX_BASES   if a read grows beyond this many bases a message is
///                    printed to cout and the bases collected so far for
///                    that read are discarded.
void SffInfo::AllBasesFromFlows(const int MAX_HEIGHT, const int MAX_BASES) {
  // No reads: nothing to recall, and the first read must not be queried for
  // its flows when it does not exist.
  if (0 == size()) return;

  SffRead r(*this);
  const int NFLOWS = r.Flows().size();
  String flowOrder = GetString(FLOW_ORDER);
  ForceAssertEq(longlong(flowOrder.size()), NFLOWS);

  std::string bases;//use little-s string so we can use push_back;
  serfvec<unsigned short> correspondence;
  for ( ; r.Index() != longlong(size()); ++r) {
    FixDots(r);
    correspondence.clear();
    bases.clear();
    for (int j=0; j != NFLOWS; ++j) {
      const int nbases = sff::Call(r.Flows()[j]);
      for (int height =0; height < nbases ; ++height) {
        correspondence.push_back(j);
        bases.push_back(flowOrder[j]);
      }
      if (longlong(bases.size()) > MAX_BASES) {
        //Something went very wrong. Don't crash, but notify the user.
        cout << "Problem: read " << r.Index() << " " << r.Name()
             << " has more than " << MAX_BASES << " bases" << endl;
        // NOTE(review): the flow loop keeps going after this reset, so the
        // read ends up with only the bases called from the remaining flows.
        // Existing behaviour, preserved; confirm whether a break was meant.
        bases.clear();
        correspondence.clear();
      }
    }
    //fix clips first because we need the old Bases() and Corr().
    r.FixClips(correspondence);
    r.Bases().SetFromString(bases);
    r.Corr() = correspondence;
    // resize() alone may keep stale scores, hence the explicit fill.
    r.Quals().resize(bases.size());
    fill(r.Quals().begin(), r.Quals().end(), 0);
  }
}

void SffInfo::SetQualsFrom(const PhredTableReader & p, unsigned char MIN, 
			   unsigned char MAX, int SUBTRACT) {
  cout << "Setting quals for " << size() << " reads in groups of 1000\n";
  for (SffRead r(*this); r.Index() != isize(); ++r) {
    DotMod(cout, r.Index(), 1000);
    r.SetQualsFrom(p, MIN, MAX, SUBTRACT);
  }
}

