// Copyright (c) 2000-2003 Whitehead Institute for Biomedical Research

// FetchReadsAmb: like FetchReads, but only determines if bases are ambiguous,
// and stores ambiguity information in a vecbitvector.

// If lower_case_instead=True, check instead if bases are lower case, as would be
// the case if RepeatMasker had marked the bases as repeats.

#ifndef FORCE_DEBUG
     #define NDEBUG
#endif

#include <ctype.h>

#include "Bitvector.h"
#include "CoreTools.h"
#include "FastIfstream.h"
#include "FetchReads.h"
#include "FetchReadsAmb.h"
#include "Feudal.h"

// IsPlainBase: return true if c is one of the four unambiguous bases A, C, G, T,
// in either upper or lower case.

static bool IsPlainBase( char c )
{    switch (c)
     {    case 'A': case 'C': case 'G': case 'T':
          case 'a': case 'c': case 'g': case 't':
               return true;
          default:
               return false;    }    }

// FetchReadsAmb: read the fasta file fasta_file and append to b one bitvector per
// record (one record per line starting with '>'), having one bit per base.
//
// If lower_case_instead is false, a bit is set if the corresponding character is
// not one of A, C, G, T (in either case).  If lower_case_instead is true, a bit is
// set if the corresponding character is lower case.
//
// White space inside a sequence is ignored.  Any other character that is neither
// one of A, C, G, T (in either case) nor accepted by ambiguous_base( ) is a fatal
// error, as is a file which cannot be opened or whose first character is not '>'.
// If the file contains no record at all, a message is printed and b is left
// with nothing appended.

void FetchReadsAmb( vecbitvector& b, String fasta_file, Bool lower_case_instead )
{    
     // Scan the file to determine the total number (nseq) of bitvectors to be 
     // generated, and a good upper bound (totalbases) for the total number of bases.

     int nseq = 0;
     longlong totalbases = 0;
     {    fast_ifstream in(fasta_file);
          if ( in.fail( ) ) FatalErr( "Trouble opening " << fasta_file << "." );
          String line, gt( ">" );
          while(1)
          {    getline( in, line );
               if ( in.fail( ) ) break;
               if ( line.Contains( gt, 0 ) ) ++nseq;
               else totalbases += line.size( );    }    }
     b.Reserve( totalbases/32 + nseq, nseq );

     if ( nseq == 0 )
     {    cout << fasta_file << " has size 0 -- I hope this is OK\n";
          return;    }

     // Now generate them.  Reopen the file and check that it starts with a 
     // header line.

     ifstream text( fasta_file.c_str( ) );
     if ( !text ) FatalErr( "Trouble opening " << fasta_file << "." );
     char c = 0;
     text.get(c);

     if ( c != '>' ) FatalErr( "File " << fasta_file << " is supposed to be in "
          << "fasta format.  In particular, each sequence should be prefaced by a "
          << "line which starts with >." );
     text.putback(c);

     // The buffers read and bx are reused for every record.

     vector<char> read;
     read.reserve(1000);
     bitvector bx;
     unsigned int nrecords = 0;
     while(text)
     {    
          // Skip over comment line.  At this point the next character is always
          // the '>' which was put back.  A header which ends the file without a
          // newline is accepted and yields an empty sequence, just as it would if
          // the newline were present.

          text.get(c);
          ForceAssert( c == '>' );
          while( text.get(c) && c != '\n' ) { }

          // Collect the bases of this record, up to the next header or end of
          // file.  The result of get( ) is tested directly: at end of file get( )
          // leaves c unchanged, so testing the stream only before the call would
          // store the last base twice when the file does not end in a newline.

          read.resize(0);
          while( text.get(c) )
          {    
               // The ctype functions require a value representable as unsigned
               // char.

               if ( isspace( static_cast<unsigned char>(c) ) ) continue;
               if ( c == '>' )
               {    text.putback(c);
                    break;    }
               if ( !IsPlainBase(c) && !ambiguous_base(c) )
                    FatalErr( "FetchReadsAmb: unrecognized character " << c << " in "
                         << fasta_file );
               read.push_back(c);    }

          // Convert it to a bitvector.

          const int nbases = static_cast<int>( read.size( ) );
          bx.Setsize(nbases);
          for ( int j = 0; j < nbases; j++ )
          {    const char base = read[j];
               bool flagged;
               if ( lower_case_instead )
                    flagged = ( islower( static_cast<unsigned char>(base) ) != 0 );
               else flagged = !IsPlainBase(base);
               if (flagged) bx.Set( j, 1 );
               else bx.Set( j, 0 );    }
          b.push_back( bx, 0 );
          ++nrecords;    }

     b.resize(nrecords);    }
