/////////////////////////////////////////////////////////////////////////////
//                   SOFTWARE COPYRIGHT NOTICE AGREEMENT                   //
//       This software and its documentation are copyright (2007) by the   //
//   Broad Institute/Massachusetts Institute of Technology.  All rights    //
//   are reserved.  This software is supplied without any warranty or      //
//   guaranteed support whatsoever. Neither the Broad Institute nor MIT    //
//   can be responsible for its use, misuse, or functionality.             //
/////////////////////////////////////////////////////////////////////////////

// File: Basevector.h
//
// This file defines the class "basevector", which stores strings from the
// alphabet {A,C,G,T}, four characters (bases) per byte.

#ifndef BASEVECTOR_H
#define BASEVECTOR_H

#include "CoreTools.h"
#include "Feudal.h"
#include "random/Random.h"
#include "SemanticTypes.h"

#include "charTranslations.h"

// Section: DNA base representation

// Enum: base_t
//
// Compact numeric representation of one DNA base.
//
//    BASE_A - adenine
//    BASE_C - cytosine
//    BASE_G - guanine
//    BASE_T - thymine
//
// base_t is declared over unsigned char through the SemanticTypeStd macro
// (from SemanticTypes.h, presumably -- the macro definition is not in this file).
SemanticTypeStd(unsigned char, base_t);

// Numeric codes of the four bases.  These coincide with the two-bit patterns
// (A=00, C=01, G=10, T=11) listed in the basevector class comment below.
const base_t BASE_A = 0;
const base_t BASE_C = 1;
const base_t BASE_G = 2;
const base_t BASE_T = 3;

/// Function: as_base
/// Converts an integer representation of a base to the one-letter name of the base.
inline
char
as_base(unsigned char x)
{
  if ( x == 0 ) 
    return 'A';
  if ( x == 1 ) 
    return 'C';
  if ( x == 2 )
    return 'G';
  if ( x == 3 )
    return 'T'; 
  else FatalErr("Incorrect integer value for base " << x);
}

// Function: IsGC
// Tests whether a given <base> is a G or a C.
// See also: <gc bias>.
inline Bool IsGC( unsigned char x )
{
  // In the numeric base encoding, 1 is C and 2 is G.
  const bool is_c = ( x == 1 );
  const bool is_g = ( x == 2 );
  return is_c || is_g;
}

/// Function: as_char
/// Convert the one-letter textual name of a base to its integer representation (inverse to <as_base()>).
inline
unsigned char
as_char(char x)
{
  // Case-insensitive mapping of a base letter to its numeric code (0..3).
  switch ( x )
    {
    case 'a': case 'A': return 0;
    case 'c': case 'C': return 1;
    case 'g': case 'G': return 2;
    case 't': case 'T': return 3;
    default: break;
    }
  // Anything that is not one of ACGTacgt is reported as a fatal error.
  FatalErr("Incorrect letter for base " << x);
}

/// Function: GetComplementaryBase
/// Return the base complementary to the given one, by the Watson-Crick base pairing.
inline base_t GetComplementaryBase(base_t base) {
  // Watson-Crick pairs: A <-> T and C <-> G.
  if ( base == BASE_A ) return BASE_T;
  if ( base == BASE_C ) return BASE_G;
  if ( base == BASE_G ) return BASE_C;
  if ( base == BASE_T ) return BASE_A;

  // Not a valid base code: assert, and hand back the out-of-range value 4.
  ForceAssert(0);
  return (base_t)4;
}

// Section: Vectors of bases

// ========================================================================

/** 
    Class: basevector

    Class to efficiently store a string of bases.

    The basevector class stores a string of bases, i.e. a string from 
    {A,C,G,T}, four per byte, according to the conversion (base two)

       A - 00
       C - 01 
       G - 10
       T - 11

    Can represent, for example: a read; a <genome part>; the sequence of one <kmer>,
    or of a <kmer path>.

    See also: <vecbasevector>.
*/
class basevector
{
public:

  friend class BasevectorIterator;
  friend class BasevectorReverseIterator;
  friend class BasevectorReverseComplementIterator;
  
  basevector( ) 
  {
    data_ = 0;
    length_ = 0;
    extra_space_and_self_owned_ = TopBit32;
  }


  basevector(const basevector& b);


  explicit basevector( unsigned int n, int extra = 0 )
    : length_(n)
  { 
    int words_needed = (n + extra + 15) / 16;
    int actual_extra = words_needed * 16 - n;
    extra_space_and_self_owned_ = TopBit32 | actual_extra;
    data_ = new unsigned int[words_needed];   
  }

  ~basevector( ) 
  {
    if ( SelfOwned( ) && data_ != 0 )
      delete [ ] data_;
    data_ = 0;
    length_ = 0;
    extra_space_and_self_owned_ = TopBit32;   
  }

  bool empty() const { return 0 == size(); }

  unsigned int size() const
  {  return length_;  }

  int isize() const
  {  return length_;  }


  /// resize, keeping contents (or part, if downsized);
  /// this never reduces space usage.

  void resize(unsigned int n);
  inline void resize(longlong n)
  {    AssertGe( n, 0 );
       resize( (unsigned int) n );    }
  inline void resize(ulonglong n)
  {    resize( (unsigned int) n );    }
  inline void resize(int n)
  {    AssertGe( n, 0 );
       resize( (unsigned int) n );    }

  /// analog of vector::swap()
  void Swap( basevector& b )
  {
    swap( data_, b.data_ );
    swap( length_, b.length_ );
    swap( extra_space_and_self_owned_, b.extra_space_and_self_owned_ );
  }


  basevector& operator = (const basevector& b);


  /// Return the i-th element of a basevector (for read access only).
  unsigned char operator [ ] (unsigned int i) const
  {
    AssertLt(i, length_);
    const unsigned char byte = DataAsBytes( i>>2 );
    switch (i & 0x3)
      {
      case 0:
        return byte & 0x3;
      case 1:
        return (byte >> 2) & 0x3;
      case 2:
        return (byte >> 4) & 0x3;
      case 3:
        return (byte >> 6) & 0x3;
      }
    return 0; // should never happen
  }


  char at(const unsigned int i) const
  {  return as_base( (*this)[i] );  }


  ///Return true if all bases are equal or if empty.
  bool IsHomopolymer() const {
    if (empty()) return true;
    unsigned char firstbase = (*this)[0];
    for (unsigned i=1; i != size(); ++i) {
      if ((*this)[i] != firstbase) return false;
    }
    return true;
  }

  /// Return the % of highest base and which base it is.
  pair<float, unsigned char> HomopolPercent() const {
    pair<float, unsigned char> ret(-1, 255);
    if (empty()) return ret;

    float count[4] = {0,0,0,0};
    for (unsigned i=0; i != size(); ++i) {
      ++count[(*this)[i]];
    }
    float * mx = max_element(count, count+4);
    ret.first = *mx * 100.0 / size();
    ret.second = mx - count;
    return ret;
  }

  /// Return homopolymer count at this base.
  int Homopol(int base) const {
    const unsigned char b= (*this)[base];
    int ret=1;
    for (int i=base -1; i >=0; --i) {
      if (b == (*this)[i]) ++ret;
      else break;
    }
    for (int i=base +1; i < isize(); ++i) {
      if (b == (*this)[i]) ++ret;
      else break;
    }
    return ret;
  }
    
  void SetData( const unsigned int* data )
  {
    for ( unsigned int i = 0; i < (length_ + 15)/16; i++ )
      data_[i] = data[i];   
  }

  void SetData( const char* data )
  {
    memcpy( data_, data, (length_ + 3)/4 );   
  }


  void Set(unsigned int i, unsigned char base)
  {
    AssertLt( i, length_ );
    AssertLt( base, 4 );
    unsigned char& target = DataAsBytes( )[ i>>2 ];
    int shift = (i&3) << 1;
    target = ( target & ( ~(3 << shift) ) ) | ((base&3) << shift);
  }

  void SetFromString(const String &s) 
  {
    int n = (int)s.size();
    Setsize ( n );
    if (0 == n) return;
    AssertGt  ( n,0 );
    for (int i=0; i<n; ++i) 
      {   
	Assert(char2num[ (unsigned int) s[i] ] <= 3 );
	Set( i, char2num[ (unsigned int) s[i] ] );  
      }
  }

  /// Set from a string containing non-ACGTacgt characters.
  /// Any such characters are assigned a random base.
  void SetFromStringWithNs(const String &s) {
    int n = (int)s.size();
    Assert  ( n>0 );
    Setsize ( n );
    int base;
    for (int i=0; i<n; ++i) {   
      base = char2num[ (unsigned int) s[i] ];
      if (base > 3 ) {
	base = randomx() % 4;
      }
      Set( i, base );  
    }
  }


  /// Translate basevector to String
  String ToString( ) const;

  /// Compute the concatenation of two or three basevectors:

  friend basevector Cat( const basevector& left, const basevector& right );
  friend basevector Cat( const basevector& b1, const basevector& b2,
                         const basevector& b3 );

  friend 
  void CopyBases( const basevector& from,
		  int from_start,
		  basevector& to,
		  int to_start,
		  int count,
		  Bool rc_from );

  /// SetToSubOf(orig_bv, start_pos, len):  Set *this to the length len 
  /// sub-basevector of  orig_bv, starting at position  start_pos.  The 
  /// case where *this = orig_bv is allowed.

  void SetToSubOf(const basevector& orig_bv,
		  unsigned int start_pos,
		  int len,
		  int extra = 0);


  /// Replace *this by its reverse complement.
  void ReverseComplement( );

  /// Replace *this by its reverse (NOT reverse complement!).
  void Reverse( );

  /// Set *this to the reverse complement of b.
  void ReverseComplement( const basevector& b )
  {
    Assert( b.Initialized() );
    *this = b;
    ReverseComplement();
  }

  /// Set *this to the reverse of b (NOT reverse complement!).
  void Reverse( const basevector& b )
  {
    Assert( b.Initialized() );
    *this = b;
    Reverse();
  }


  /// Method: GcBases
  /// Calculate the GC content of a basevector. If end<0 the size of
  /// the basevector is used.  
  unsigned int GcBases( int start = 0, int end = -1) const; 

  /// Enum: CanonicalForm
  ///
  /// Return type of <Canonicalize()>, which tells you what form of the
  /// input basevector was canonical.
  ///
  ///     rc_form - rc of input was canonical
  ///     palindrome - input was palindromic
  ///     fw_form - input was canonica
  enum CanonicalForm {
    rc_form = -1,   
    palindrome = 0, 
    fw_form = 1     
  };

 private:
  CanonicalForm CanonicalizeGeneralImpl() {
    static basevector rc;
    rc.ReverseComplement(*this);
    for ( unsigned int j = 0; j < (length_+3)/4; j++ ) {
      if ( rc.DataAsBytes(j) < this->DataAsBytes(j) ) {
        *this = rc;
        return rc_form;
      }
      if ( rc.DataAsBytes(j) > this->DataAsBytes(j) ) 
        return fw_form;
    }
    return palindrome;
  }

 public:

  /// Method: Canonicalize
  ///
  /// Turn this basevector into its canonical form, according to a
  /// given (and arbitrary) ordering function.  There are three cases:
  ///
  ///  - this basevector is less than its reverse complement - leave
  /// the basevector alone and return fw.
  ///
  ///  - the reverse complement is less than this basevector -
  /// reverse complement the basevector and return rc.
  ///
  ///  - neither is less than the other - leave the basevector alone
  /// and return palindrome
  CanonicalForm Canonicalize() {
    if ( length_ % 4 == 0 ) {
      const int numbytes = length_/4;
      for ( int j = 0,rcj = numbytes-1; j < numbytes; j++,rcj-- )
        if ( this->DataAsBytes(j) < RCtable[this->DataAsBytes(rcj)] ) 
          return fw_form;
        else if ( this->DataAsBytes(j) > RCtable[this->DataAsBytes(rcj)] ) {
          this->ReverseComplement();
          return rc_form;
        }
      return palindrome;
    }
    else 
      return this->CanonicalizeGeneralImpl();
  }


  /// Return the position of other inside ourselves, or size() if not found.
  /// Start looking at start, and stop looking when we reach the
  /// minimum of end or size().
  unsigned int Find(const basevector & other, unsigned int start=0, 
	   unsigned int end=INT_MAX) const;
	

  /// Cap: in a given basevector, replace any sequence of N > n identical
  /// bases by n of the same base.
  void Cap( int n );


  friend
  bool operator == ( const basevector& x, const basevector& y )
  {
    if ( x.size() != y.size() ) 
      return false;
    if ( memcmp( x.DataAsBytes(), y.DataAsBytes(), x.size() / 4 ) != 0 ) 
      return false;
    for ( unsigned int i = (x.size() / 4) * 4; i < x.size(); ++i )
      if ( x[i] != y[i] ) 
        return false;
    return true;
  }

  friend
  bool operator != ( const basevector& x, const basevector& y )
  {  return !(x == y);  }


  /// Define x < y if x is lexicographically less than y:
  friend
  bool operator < ( const basevector& x, const basevector& y )
  {
    for ( unsigned int i = 0; i < x.size( ); i++ )
      { 
	if ( i >= y.size( ) )
	  return false;
	if ( x[i] < y[i] )
	  return true;
	if ( x[i] > y[i] )
	  return false;   
      }
    return x.size( ) < y.size( );   
  }



  /** PrintBases: Select the "nbases" bases starting at "start".  
      If rc = False, print them.  Otherwise print the reverse complement 
      of the bases.
  */
  void PrintBases(ostream& out,
		  int start, 
		  int nbases,
		  Bool rc=False ) const;

  void Print( ostream& out, int id ) const;
  void Print( ostream& out, String id ) const;
  void Print( ostream& out ) const;


  /** Read/write a basevector from a file.  
      The format of the file is first a base
      count, then a newline, then the bases, as in the data_ member 
      of this class.
  */
  friend
  istream& operator>>(istream&, basevector&);

  /** Read/write a basevector from a file.  
      The format of the file is first a base
      count, then a newline, then the bases, as in the data_ member 
      of this class.
  */
  friend
  ostream& operator<<(ostream&, const basevector&);


  /// RawReadBytes and RawWriteBytes (respectively) take bytes from disk (as 
  /// defined by a file descriptor) and stuff them into the data_ member of a
  /// basevector, or do the reverse.  
  void RawReadBytes( int fd, longlong nbytes )
  {
    ForceAssertGe( nbytes, 0 );
    ForceAssertLe( (size_t) nbytes, sizeof(unsigned int) * (size_t) length_ );
    ReadBytes( fd, data_, nbytes );   
  }

  void RawWriteBytes( int fd, longlong nbytes ) const
  {
    ForceAssertGe( nbytes, 0 );
    ForceAssertLe( (size_t) nbytes, sizeof(unsigned int) * (size_t) length_ );
    WriteBytes( fd, data_, nbytes );   
  }

  friend void BinaryWrite( int fd, const basevector& b );
  friend void BinaryRead( int fd, basevector& b );

  //--------------------------------------------------

  // The following methods are provided for compatibility with
  // the implementation of mastervec template in Feudal.
  //

  typedef unsigned int value_type;

  const char* StartOfStaticData( ) const
  {  return (char*) &length_;  }

  const int SizeOfStaticData( ) const
  {  return sizeof(length_);  }

  const unsigned int* StartOfDynamicData( ) const
  {  return data_;  }

  unsigned int* StartOfDynamicData( )
  {  return data_;  }

  longlong SizeOfDynamicData( ) const
  {  return (length_ + 15) / 16;  }

  longlong ExtraSpaceForDynamicData( ) const
  {  return ExtraSpace( );  }

  Bool SelfOwned( ) const
  {  return SelfOwnedBit( ) != 0;  }

  void SetExternal( const char* start_of_static_data, 
                    unsigned int* pointer_to_dynamic_data, 
                    int size_of_dynamic_data, 
                    int extra_space_for_dynamic_data )
  {
    data_ = pointer_to_dynamic_data;
    length_ = *( (unsigned int*) start_of_static_data );
    extra_space_and_self_owned_ = 
      (length_ + 15)/16 * 16 - length_
      + 16 * extra_space_for_dynamic_data;   
  }

  void ShiftStartOfDynamicData( unsigned int* a1, unsigned int* a2 )
  {  data_ += a1 - a2;  }

  void Reinitialize( )
  {   
    if ( SelfOwned( ) && data_ != 0 )
      delete [ ] data_;
    data_ = 0;
    length_ = 0;
    extra_space_and_self_owned_ = TopBit32;   
  }

  void Blank( )
  {
    data_ = 0;
    length_ = 0;
    extra_space_and_self_owned_ = TopBit32;   
  }


  //--------------------------------------------------

  // The following accessors should probably be made private.

  unsigned int SelfOwnedBit( ) const
  {  return extra_space_and_self_owned_ & TopBit32;  }

  unsigned int ExtraSpace( ) const
  {  return extra_space_and_self_owned_ & Bits31;  }


  /** BE VERY SURE YOU KNOW WHAT YOU ARE DOING BEFORE YOU USE THIS METHOD.
      The following method provides unrestricted access to
      the  data_  member of the  basevector  class.  For the
      sake of efficiency, no check is done on the value of
      the argument -- hence there is the high risk of indexing
      out-of-range in using this method.
  */
  unsigned char  DataAsBytes( int n ) const 
  {  return ((unsigned char*) data_)[n];  }

  /** BE VERY SURE YOU KNOW WHAT YOU ARE DOING BEFORE YOU USE THIS METHOD.
      The following method provides unrestricted access to
      the  data_  member of the  basevector  class.  For the
      sake of efficiency, no check is done on the value of
      the argument -- hence there is the high risk of indexing
      out-of-range in using this method.
  */
  unsigned int   DataAsInts( int n ) const 
  {  return data_[n];  }


  unsigned char Initialized( ) const
  {  return data_ != 0;  }


  /// Setsize: resize, destroying contents
  void Setsize(unsigned int n, unsigned int extra = 0)
  {
    if ( data_ != 0 &&
	 length_ == n &&
	 ExtraSpace( ) == extra )
      return;

    if ( data_ == 0 )
      {   
	data_ = new unsigned int[(n + extra + 15)/16];
	length_ = n;
	extra_space_and_self_owned_ = TopBit32 | extra;  
      }
    else if ( length_ + ExtraSpace( ) >= n + extra )
      {   
	extra_space_and_self_owned_ = 
	  SelfOwnedBit( ) | (length_ + ExtraSpace( ) - n);
	length_ = n; 
      }
    else
      {   
	if ( SelfOwned( ) )
	  delete [ ] data_;
	data_ = new unsigned int[(n + extra + 15)/16];
	length_ = n;
	extra_space_and_self_owned_ = TopBit32 | extra;   
      }
  }

private:

  // For full compatibility across 32-bit and 64-bit architectures, we use a 
  // union to pad the data_ pointer so that 8 bytes are always used.  Of course
  // this wastes space on 32-bit systems.

  union {
  /// The bases themselves, stored at 4 bases per character.
       unsigned int *  data_;
       longlong unused_;
  };

  /// The number of bases.
  unsigned int length_;
  /// Keep track of space for more bases and self-ownership:
  unsigned int extra_space_and_self_owned_;

  /// The lookup table used by  ReverseComplement().
  /// It is declared as a  static const  member here,
  /// and it is defined in  Basevector.cc
  static const unsigned char RCtable[256];

  /// The lookup table used by  Get
  static const unsigned char GCtable[256];

  unsigned char* DataAsBytesRW( )
  {  return (unsigned char*) data_;  }


public: //but not really!

  /// DO NOT USE THIS METHOD IN ANY OTHER CODE.
  /// The following method is made public only for SortKmers.
  /// It will be made private once SortKmers is eliminated.
  unsigned char* DataAsBytes( ) const
  {  return (unsigned char*) data_;  }
};

// Section: Related functions

/// Mirrors Print() for qualvectors so we can use templates.
/// Delegates to basevector::Print, param scores_per_line is not used.
inline void Print( ostream &out, const basevector &b, const String &name,
                   const int scores_per_line = 60 )
{
  // The last parameter exists only so this signature matches the qualvector
  // Print(); it plays no part in printing bases.
  (void) scores_per_line;
  b.Print( out, name );
}
  
/**
  Function: CopyBases

  CopyBases: copy bases from one basevector to another; equivalent to
  >     for ( int i = 0; i < count; i++ )
  >           to.Set( to_start + i, from[ from_start + i ] );
  but faster.  However, the code could still be made substantially faster.
  If rc_from = True, copy bases from rc(from), starting at from_start on
  rc(from).
*/
void
CopyBases( const basevector& from,
	   int from_start,
	   basevector& to,
	   int to_start,
	   int count,
	   Bool rc_from = False );

/**
   Class: vecbasevector

   A vector of <base vectors>.  Can represent a list of <genome parts>, or a set of <reads> or of <unipaths>.
   Corresponds to a <.fastb> file.

   The order of the basevectors in a vecbasevector normally is arbitrary; we can give each basevector an id
   based on its position in the list (see <genome id>, <read id>) but the order of basevectors in a vecbasevector
   is normally unrelated to their relative order in the genome.
*/
typedef mastervec< basevector, unsigned int > vecbasevector;

// Synonyms: Shorter names for common classes
//   bvec - See <basevector>
//   vecbvec - See <vecbasevector>
typedef basevector bvec;
typedef vecbasevector vecbvec;

// Section: Logical types
//
// There are just two physical types here -- a basevector and a list of basevector (vecbasevector).
// But there are several common semantic roles in which these types are used.
// The logical types below can be used to make the logical/semantic meaning of declarations
// more apparent.

// Type: kmer_t
// Logical type for a <basevector> that represents a single kmer.
// Note that there is also a class <kmer>, which is templatized on kmer
// size and always holds exactly one kmer.  By contrast, kmer_t is just
// a logical type for a basevector that happens to represent a single kmer
// (although the underlying basevector class can represent longer base vectors).
SemanticType( basevector,  kmer_t );

// Type: read_t
// Logical type for a <basevector> that represents a single read.
SemanticType( basevector, read_t );

// Type: genome_part_t
// Logical type for a <basevector> that represents a single <genome part>.
SemanticType( basevector, genome_part_t );

// Type: reads_t
// Logical type for a <vecbasevector> that represents a set of reads.
SemanticType( vecbasevector, reads_t );

// Type: genome_t
// Logical type for a <vecbasevector> that represents a genome, as a list of <genome parts>.
SemanticType( vecbasevector, genome_t );


// Section: Functions

/// Function: StringReverseComplement
/// It stores the RC of seq in rc_seq.
void StringReverseComplement ( const String &seq, String &rc_seq );

/// Calculate the GC content of a basevector. If end<0 the size of b is used.
/// \ingroup grp_gc
unsigned int GcBases( const basevector& b, int start = 0, int end = -1); 

/// Calculate the percent GC in a basevector.  If end<0 the size of b is used.
/// \ingroup grp_gc
float GcPercent( const basevector& b, int start = 0, int end = -1 ); 

/// Calculate the percent GC in a String; this overload takes no start/end range.
/// \ingroup grp_gc
float GcPercent( const String& b );

// See if two basevectors overlap exactly by r bases.
Bool Overlap( const basevector& s, const basevector& t, int r );

// Reverse complement every sequence in a vecbasevector.

void ReverseComplement( vecbasevector& s );
// SizeSum: presumably the total of the sizes of all basevectors in s
// (the definition is not visible in this header -- TODO confirm).
longlong SizeSum( const vecbasevector& s );

/// Specialization needed because sizeof(basevector) is 12 in old files.
/// (If the files were generated before 2006-08-29 on a 32-bit architecture).
template<> bool IsGoodFeudalFile<basevector>( const String & filename, 
 const basevector * dummy, bool verbose, bool ok3);


#endif

// Synonyms: Various synonyms
//   base - See <base_t>
