// Copyright (c) 2000-2003 Whitehead Institute for Biomedical Research
// 

// CompressedSequence stores each base in 3 bits, 2 to indicate A, C,
// G, or T, and the third to indicate N.  5 of these 3-bit values are
// stored in every short (wasting one bit per 2 bytes).

#include "CompressedSequence.h"
#include "random/Random.h"

// Lookup table from an input character (indexed by its unsigned char
// value) to the 3-bit base code stored in a CompressedSequence:
//   'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3,
// and every other character -> 4, which the decoders in this file
// report as 'N'.
const unsigned char char2base[256] = { 
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4, 0 ,  4, 1 ,  4,  4,  4, 2 ,  4,  4,  4,  4,  4,  4,  4,  4,  // 0x40: 'A', 'C', 'G'
    4,  4,  4,  4, 3 ,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  // 0x50: 'T'
    4, 0 ,  4, 1 ,  4,  4,  4, 2 ,  4,  4,  4,  4,  4,  4,  4,  4,  // 0x60: 'a', 'c', 'g'
    4,  4,  4,  4, 3 ,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  // 0x70: 't'
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 
};


// Default constructor: an empty sequence with no storage allocated,
// flagged as owning its (currently null) data pointer.
CompressedSequence::CompressedSequence()
  : data_(0),
    size_(0),
    extra_(0),
    self_owned_(1)
{
}

// Builds a self-owned compressed copy of a basevector.  Base number k
// is stored in short k/5, in the 3 bits starting at bit 3*(k%5).
CompressedSequence::CompressedSequence(const basevector& bv )
  : data_(0), size_(bv.size()), extra_(0), self_owned_(1)
{
  // Nothing to allocate for an empty basevector.
  if ( size_ == 0 )
    return;

  const size_t num_words = (size_ + 4) / 5;
  data_ = new unsigned short[ num_words ];

  for ( unsigned int base_idx = 0; base_idx < size_; ++base_idx )
  {
    const unsigned int word = base_idx / 5;
    const unsigned int slot = base_idx % 5;

    // Clear each short the first time a base lands in it.
    if ( slot == 0 )
      data_[word] = 0;

    data_[word] += ( bv[base_idx] << ( slot * 3 ) );
  }
}


// Builds a self-owned compressed sequence from a NUL-terminated string.
// Each character is translated through char2base; '*' characters are
// skipped entirely and do not count towards the final size.  Storage
// is sized for the full string length, so it may be larger than needed
// when '*' characters are present.
CompressedSequence::CompressedSequence(const char* char_ptr)
  : data_(0), size_(0), extra_(0), self_owned_(1)
{
  size_ = strlen( char_ptr );

  if ( size_ == 0 )
    return;

  const char* const input_end = char_ptr + size_;

  data_ = new unsigned short[ (size_ + 4) / 5 ];
  data_[0] = 0;

  // Number of bases actually written so far (i.e. non-'*' characters).
  unsigned int num_stored = 0;

  for ( const char* cursor = char_ptr; cursor != input_end; ++cursor )
  {
    if ( *cursor == '*' )
      continue;

    const unsigned int word = num_stored / 5;
    const unsigned int slot = num_stored % 5;

    // Clear each short the first time a base lands in it.
    if ( slot == 0 )
      data_[word] = 0;

    data_[word] += ( char2base[ static_cast<unsigned char>(*cursor) ] << ( slot * 3 ) );

    ++num_stored;
  }

  // The stored size excludes every skipped '*'.
  size_ = num_stored;
}

// Builds a self-owned compressed sequence from a vector of characters.
// Each character is translated through char2base; '*' characters are
// skipped entirely and do not count towards the final size.  Storage
// is sized for the full vector length, so it may be larger than needed
// when '*' characters are present.
CompressedSequence::CompressedSequence(const vec<char>& vec_char)
  : data_(0), size_(vec_char.size()), extra_(0), self_owned_(1)
{
  if ( size_ == 0 )
    return;

  data_ = new unsigned short[ (size_ + 4) / 5 ];
  data_[0] = 0;

  // Number of bases actually written so far (i.e. non-'*' characters).
  unsigned int num_stored = 0;

  const vec<char>::const_iterator stop = vec_char.end();
  for ( vec<char>::const_iterator it = vec_char.begin(); it != stop; ++it )
  {
    if ( *it == '*' )
      continue;

    const unsigned int word = num_stored / 5;
    const unsigned int slot = num_stored % 5;

    // Clear each short the first time a base lands in it.
    if ( slot == 0 )
      data_[word] = 0;

    data_[word] += ( char2base[ static_cast<unsigned char>(*it) ] << ( slot * 3 ) );

    ++num_stored;
  }

  // The stored size excludes every skipped '*'.
  size_ = num_stored;
}

// Copy constructor.  Makes a deep, self-owned copy of the shorts that
// hold the original's bases.  Any extra space the original has is not
// carried over (extra_ is reset to 0).
CompressedSequence::CompressedSequence(const CompressedSequence& original) :
  data_(0), size_(original.size()), extra_(0), self_owned_(1)
{
  const size_t alloc_size = (size_ + 4) / 5;

  // An empty sequence may have a null data_ pointer, and handing a null
  // pointer to memcpy is undefined even for a zero-byte copy.  Leave
  // data_ null, matching a default-constructed object.
  if ( alloc_size == 0 )
    return;

  data_ = new unsigned short[ alloc_size ];

  memcpy( data_, original.data_, alloc_size*sizeof(unsigned short) );
}

// Copy assignment.  Replaces this object's contents with a deep,
// self-owned copy of the original's bases and resets extra_ to 0.
// Storage this object previously owned is released; storage it did not
// own is simply dropped.  Self-assignment is a no-op.
CompressedSequence& CompressedSequence::operator= (const CompressedSequence& original)
{
  if ( this == &original)
    return *this;

  const size_t alloc_size = (original.size_ + 4) / 5;

  // Build the new buffer before touching the old one, so that a failed
  // allocation leaves this object unchanged instead of holding a
  // pointer to already-deleted memory.
  unsigned short* new_data = 0;
  if ( alloc_size > 0 )
  {
    // Skipped for empty sequences: original.data_ may be null there,
    // and memcpy from a null pointer is undefined even for zero bytes.
    new_data = new unsigned short[ alloc_size ];
    memcpy( new_data, original.data_, alloc_size*sizeof(unsigned short) );
  }

  if ( data_ && self_owned_ )
    delete [] data_;

  data_ = new_data;
  size_ = original.size();
  extra_ = 0;
  self_owned_ = 1;

  return *this;
}


// Equality: two sequences are equal when they hold the same number of
// bases and the shorts storing those bases are byte-for-byte identical.
// Two empty sequences always compare equal.
const bool operator== (const CompressedSequence& lhs,
		       const CompressedSequence& rhs)
{
  if ( lhs.size_ != rhs.size_ )
    return false;

  // Empty sequences may have null data pointers; memcmp must not be
  // called on those, even with a length of zero.
  if ( lhs.size_ == 0 )
    return true;

  const size_t bytes_to_compare = ( lhs.size_ + 4 ) / 5 * sizeof(unsigned short);

  return memcmp( lhs.data_, rhs.data_, bytes_to_compare ) == 0;
}

// Inequality: the exact negation of operator==.
const bool operator!= (const CompressedSequence& lhs,
		       const CompressedSequence& rhs)
{
  if ( lhs == rhs )
    return false;

  return true;
}

// Destructor: frees the data array only when this object owns it.
CompressedSequence::~CompressedSequence()
{
  // Storage that is not self-owned belongs to someone else.
  if ( !self_owned_ )
    return;

  if ( data_ )
    delete [] data_;
}

// Replaces the sequence, in place, with its reverse complement: the
// order of the bases is reversed and each base code c in 0..3 becomes
// 3 - c (A<->T, C<->G).  Ambiguous bases (any code above 3) are always
// written back as code 4, so the stored representation of N does not
// depend on how many times the sequence has been reverse-complemented
// and the bytewise operator== keeps working.
//
// The result is assembled in a scratch buffer (on the stack for up to
// 512 shorts, on the heap otherwise) and then copied over data_.
void CompressedSequence::ReverseComplement()
{
  const size_t alloc_size = (size_ + 4) / 5;

  if ( alloc_size == 0 )
    return;

  const size_t max_local_size = 512;
  unsigned short local_rc_data[max_local_size];

  const bool on_heap = ( alloc_size > max_local_size );
  unsigned short* rc_data = on_heap ? new unsigned short[alloc_size]
                                    : local_rc_data;

  for ( unsigned int dst = 0; dst < size_; ++dst )
  {
    // Base dst of the result comes from base (size_ - 1 - dst) of the
    // current contents.
    const unsigned int src = size_ - 1 - dst;
    const unsigned short encoded_base =
      ( data_[src / 5] >> ( 3 * ( src % 5 ) ) ) & 7;

    const unsigned short rc_base =
      ( encoded_base > 3 ) ? 4 : ( 3 - encoded_base );

    // Clear each short the first time a base lands in it.
    if ( dst % 5 == 0 )
      rc_data[dst / 5] = 0;

    rc_data[dst / 5] += ( rc_base << ( 3 * ( dst % 5 ) ) );
  }

  memcpy( data_, rc_data, alloc_size*sizeof(unsigned short) );

  if ( on_heap )
    delete [] rc_data;
}


// Decodes the bases in the half-open range [begin, end) into vec_char
// as the characters 'A', 'C', 'G', 'T', or 'N' (for any code above 3).
//
// vec_char is always cleared first.  It is left empty if the sequence
// is empty, begin is negative, or begin > end.  An end past the last
// base is clamped to the sequence size, so only existing bases are
// decoded and only that many characters are reserved.
void CompressedSequence::SubAsVecChar( vec<char> &vec_char, int begin, int end ) const
{
  vec_char.clear();

  if ( size_ == 0 || begin < 0 || begin > end )
    return;

  // Both values are known to be non-negative here, so the casts are safe.
  const unsigned int first = static_cast<unsigned int>( begin );
  unsigned int last = static_cast<unsigned int>( end );
  if ( last > size_ )
    last = size_;

  if ( first >= last )
    return;

  vec_char.reserve( last - first );

  // Codes 4..7 all mean "ambiguous".
  static const char decode[8] = { 'A', 'C', 'G', 'T', 'N', 'N', 'N', 'N' };

  // Base k lives in short k/5 at bit offset 3*(k%5), so decoding can
  // start directly at the first requested base.
  for ( unsigned int pos = first; pos < last; ++pos )
  {
    const unsigned short encoded_base =
      ( data_[pos / 5] >> ( 3 * ( pos % 5 ) ) ) & 7;
    vec_char.push_back( decode[encoded_base] );
  }
}


// Convenience overload: returns the decoded characters for the range
// [begin, end) by value instead of filling a caller-supplied vector.
vec<char> CompressedSequence::SubAsVecChar( int begin, int end ) const
{
  vec<char> decoded;
  SubAsVecChar( decoded, begin, end );
  return decoded;
}


// Decodes the whole sequence into vec_char (the range 0 .. size_).
void CompressedSequence::asVecChar( vec<char> &vec_char ) const
{
  SubAsVecChar( vec_char, 0, size_ );
}


// Convenience overload: returns the whole decoded sequence by value.
vec<char> CompressedSequence::asVecChar() const
{
  vec<char> decoded;
  asVecChar( decoded );
  return decoded;
}
  

  
// Fills the_basevector with this sequence's bases.  The basevector is
// resized to size_ first.  Every ambiguous base (code above 3) is
// replaced by randomx() % 4, drawn once per ambiguous base in sequence
// order.
void CompressedSequence::asBasevector( basevector &the_basevector ) const
{
  the_basevector.Setsize( size_, 0 );

  for ( unsigned int base_idx = 0; base_idx < size_; ++base_idx )
  {
    // Base k lives in short k/5 at bit offset 3*(k%5).
    unsigned short encoded_base =
      ( data_[base_idx / 5] >> ( 3 * ( base_idx % 5 ) ) ) & 7;

    if ( encoded_base > 3 )
      encoded_base = randomx() % 4;

    the_basevector.Set( base_idx, encoded_base );
  }
}

// Convenience overload: returns the sequence as a basevector by value.
basevector CompressedSequence::asBasevector() const
{
  basevector converted;
  asBasevector( converted );
  return converted;
}


// Resizes the_amb_bases to size_, zeroes it, and then sets the bit at
// every position whose stored base code is above 3 (i.e. ambiguous).
void CompressedSequence::getAmbBases( bitvector &the_amb_bases ) const
{
  the_amb_bases.Setsize( size_, 0 );
  the_amb_bases.Zero();

  for ( unsigned int base_idx = 0; base_idx < size_; ++base_idx )
  {
    // Base k lives in short k/5 at bit offset 3*(k%5).
    const unsigned short encoded_base =
      ( data_[base_idx / 5] >> ( 3 * ( base_idx % 5 ) ) ) & 7;

    if ( encoded_base > 3 )
      the_amb_bases.Set( base_idx, True );
  }
}


// Returns the sequence as a String of 'A', 'C', 'G', 'T' and 'N'
// characters ('N' for any code above 3).  An empty sequence yields an
// empty String.
String CompressedSequence::asString() const
{
  String the_string;

  if ( size_ == 0 )
    return the_string;

  the_string.resize(size_);

  // Indexed by the 3-bit code; codes 4..7 all decode to 'N'.
  const char* const letters = "ACGTNNNN";

  for ( unsigned int base_idx = 0; base_idx < size_; ++base_idx )
  {
    // Base k lives in short k/5 at bit offset 3*(k%5).
    const unsigned short encoded_base =
      ( data_[base_idx / 5] >> ( 3 * ( base_idx % 5 ) ) ) & 7;

    the_string[base_idx] = letters[encoded_base];
  }

  return the_string;
}

// Number of bases stored in the sequence.
const int CompressedSequence::size() const
{
  return static_cast<int>( size_ );
}

// Returns the size in bytes of the shorts needed to hold size_ + extra_
// bases at five bases per short (rounded up to a whole short).
const int CompressedSequence::real_size() const
{
  return ( size_ + extra_ + 4 ) / 5 * sizeof(unsigned short);
}

// serf functions (c.f. Feudal.h)

// Points this object at externally managed storage.
//
//   start_of_static_data         - the base count (size_) is copied
//                                  from the first sizeof(unsigned int)
//                                  bytes at this address.
//   pointer_to_dynamic_data      - becomes data_; the object is marked
//                                  as not owning it.
//   size_of_dynamic_data         - not used by this function.
//   extra_space_for_dynamic_data - additional room after the data;
//                                  credited at 5 bases per 2 units,
//                                  rounded up.
//                                  NOTE(review): the rounding assumes
//                                  the unit is bytes - confirm against
//                                  the caller.
//
// Any storage the object held before the call is not released here.
void CompressedSequence::SetExternal( const char* start_of_static_data,          
				      unsigned short* pointer_to_dynamic_data,
				      int size_of_dynamic_data,
				      int extra_space_for_dynamic_data )
{
  memcpy( &size_, start_of_static_data, sizeof(unsigned int) );
  data_ = pointer_to_dynamic_data;
  // To get the total extra space available, we start with the unused
  // base slots in the last short used by actual data.  Each short
  // holds 5 bases, so that is 5 - size_ % 5, or 0 when the last short
  // is exactly full (or when there is no data at all).
  extra_ = ( 5 - size_ % 5 ) % 5;
  // Then we add room for 5 more bases for every 2 bytes of extra
  // space we're given.
  extra_ += (( extra_space_for_dynamic_data + 1 )/2) * 5;
  self_owned_ = 0;
}

// Moves data_ by the distance between old_start and new_start.  Only
// valid for objects that do not own their storage; this is asserted.
void CompressedSequence::ShiftStartOfDynamicData( unsigned short* new_start,
						  unsigned short* old_start )
{
  ForceAssert( !self_owned_ );

  data_ = data_ + ( new_start - old_start );
}

void CompressedSequence::Reinitialize()
{
  if ( data_ && self_owned_ )
    delete [] data_;
  this->Blank();
}

// Resets every member to the default-constructed state.  Does not free
// anything: whatever data_ pointed at is simply forgotten.
void CompressedSequence::Blank()
{
  self_owned_ = 1;
  extra_ = 0;
  size_ = 0;
  data_ = 0;
}

// Specialization of IsGoodFeudalFile for CompressedSequence.
//
// Runs the generic IsGoodFeudalFile check first, then reads the file's
// control block.  Old-format files are accepted as they are; otherwise
// the control block's vecSize() must be 16 or 12.  Returns false if
// any check fails or the control block cannot be read in full.  When
// verbose is set, the reason for a failure is written to cerr.
// The dummy argument is not used.
template<> bool IsGoodFeudalFile<CompressedSequence>( const String & filename,
    const CompressedSequence * dummy, bool verbose, bool ok3) {
  if (!IsGoodFeudalFile(filename, verbose, ok3)) return false;

  mv_file_control_block control;
  int fd = Open( filename, O_RDONLY );
  const long bytes_read = read( fd, &control, sizeof(control) );
  close(fd);

  // Do not inspect a control block that was not read completely.
  if ( bytes_read != static_cast<long>( sizeof(control) ) ) {
    if (verbose) {
      cerr << "IsGoodFeudalFile: Failed to read control block" << endl;
    }
    return false;
  }

  if (control.IsOldFormat()) return true;

  if ( control.vecSize() != 16 && control.vecSize() != 12) {
    if (verbose) {
      cerr << "IsGoodFeudalFile: Failed vecSize test" << endl;
      cerr << "Should be 16 (12 for old files), but is "
	<< int(control.vecSize()) << endl;
    }
    return false;
  }
  return true;
}
						      

#include "FeudalTemplate.h"

#ifdef __DECCXX_VER
#pragma define_template mastervec<CompressedSequence, unsigned short>
#else
INSTANTIATE_MASTERVEC( CompressedSequence, unsigned short )
#endif
