from Bio import SeqIO
import json
import logging
from pyfaidx import Fasta

logger = logging.getLogger(__name__)

#-------------------------------------------------------------------------
# build abbreviated reference
#-------------------------------------------------------------------------
def buildAbbrevRef(
  cloudCoord_list,
  refFasta_path,
  newRef_path,
  MAX_EXTEND_LEN=2000,
  uid=None
):
  """Write an abbreviated reference FASTA covering only the given clouds.

  Each cloud interval is extended on both sides (by at most
  MAX_EXTEND_LEN, or by half of the gap to the neighbouring cloud on the
  same chromosome if that is smaller), the corresponding subsequence is
  read from the reference and written to `newRef_path` as its own FASTA
  record.

  Args:
    cloudCoord_list: list of (chrName, minpos, maxpos) tuples.  The list
      is sorted IN PLACE; the cloud id used in record names is the index
      of the cloud in the sorted list.
    refFasta_path: path of the reference FASTA, opened with
      pyfaidx.Fasta.
    newRef_path: path of the FASTA file to create (overwritten).
    MAX_EXTEND_LEN: maximum number of bases a cloud is extended on each
      side.
    uid: optional identifier embedded in every record name.

  Returns:
    dict mapping each written record name to
    [chrName, beginPos, endPos], where the record holds the reference
    slice [beginPos:endPos + 1] of chrName.
  """

  logger.info('creating new abbrv reference')

  ref_map = Fasta(refFasta_path)

  # ensure sorted so that list neighbours are genomic neighbours
  cloudCoord_list.sort()

  numClouds = len(cloudCoord_list)
  lastCid = numClouds - 1
  # width of the sequence lines in the output fasta
  fragSize = 74
  # gap assumed when there is no neighbour on the same chromosome; half
  # of it is MAX_EXTEND_LEN
  defaultGap = MAX_EXTEND_LEN * 2
  uidSuffix = '' if uid is None else '-{0}'.format(uid)

  newRefCoord_map = {}

  logger.info('  - extracting fasta subsequences')
  with open(newRef_path, 'w') as f:
    # floor division keeps the step an int on python 3 as well, so
    # progress is reported roughly every 10% of the clouds
    step = max(1, numClouds // 10)
    for (cid, (chrName, minpos, maxpos)) in enumerate(cloudCoord_list):
      if cid % step == 0:
        logger.info('   - passed (%s/%s)', cid, numClouds)

      chrSize = len(ref_map[chrName])

      # examine left neighbor cloud
      # NOTE(review): if neighbouring clouds overlap the gap is negative
      # and the window starts/ends inside the cloud -- confirm intended
      lgap = defaultGap
      if cid != 0:
        (lchrName, _, lmaxpos) = cloudCoord_list[cid - 1]
        if lchrName == chrName:
          lgap = min(lgap, minpos - lmaxpos)

      # examine right neighbor cloud
      rgap = defaultGap
      if cid != lastCid:
        (rchrName, rminpos, _) = cloudCoord_list[cid + 1]
        if rchrName == chrName:
          rgap = min(rgap, rminpos - maxpos)

      # each cloud takes half of the gap to its neighbour; `//` (not `/`)
      # so the positions stay ints and remain valid slice indices
      lext = lgap // 2
      rext = rgap // 2
      beginPos = max(minpos - lext, 0)
      # NOTE(review): end is clamped to chrSize - 2, not chrSize - 1;
      # kept as in the original -- reason not visible here
      endPos   = min(maxpos + rext, chrSize - 2)

      cloudSubSeq = ref_map[chrName][beginPos:endPos + 1]

      # assign a new header to reference this cloudID
      # NOTE(review): cid ends up in the name twice (e.g. 'cloud-55' for
      # cid 5); kept unchanged since consumers may rely on these names
      seqName = 'cloud-{0}{1}'.format(cid, uidSuffix) + str(cid)

      # fasta record: header line followed by fixed-width sequence lines
      fastaLines = ['>' + seqName + '\n']
      for i in range(0, len(cloudSubSeq), fragSize):
        fastaLines.append(str(cloudSubSeq[i:i + fragSize]) + '\n')

      # append new fasta template
      f.write(''.join(fastaLines))

      # save base coordinates for this cloud so can map resulting
      # coordinates back to the original reference coordinates
      newRefCoord_map[seqName] = [chrName, beginPos, endPos]

  return newRefCoord_map

