#!/bin/env python

#This script processes the AGE output and selects sequences to be output in a text file. 
#In this script two command line arguments are given.
#The first argument is a file containing the list of AGE aligned files.
#The second is the output file.

from __future__ import print_function
import sys
import numpy as np

#This function prints the (found) best sequence to the output file (the handle prfSeqFile; prf means purrfect).
#fragName is the name of the moleculo fragment.
#symStr stands for the string of symbols (the |||.|  ||.|, etc.); uSymStr and lSymStr are the sequence strings above and below it.
def prntBestSeq(fragName, uSymStr, symStr, lSymStr, seqStrt, seqEnd, exCoL, exCoR, chrom):
  """Write one selected alignment record to the module-level prfSeqFile.

  The record consists of five lines: a header with the fragment name and
  the start coordinate, the upper sequence string, the symbol string, the
  lower sequence string, and a footer with the end coordinate, the
  chromosome and the two breakpoint coordinates.
  """
  header = "{0} Start: {1}".format(fragName, seqStrt)
  footer = "End: {0}, chr{1}, breakpoint coord: ({2}, {3})".format(seqEnd, chrom, exCoL, exCoR)
  for recordLine in (header, uSymStr, symStr, lSymStr, footer):
    print(recordLine, file=prfSeqFile)

#This function processes a sequence that has a perfect breakpoint match and satisfies the criteria (see below)
#to be a possible best sequence.
#Note that the loops in this function are constructed in such a way
#that smLn reads the top sequence (for example AGAA...) in the AGE output.
#It returns a tuple of three strings: the top sequence, the symbol string and the bottom sequence.
def processPrfSeq(seqStrt, seqEnd):
  """Collect the text of one alignment from the module-level ageAln iterator.

  Lines are consumed from ageAln until one whose stripped text starts with
  seqStrt. From there the alignment is read in groups of four lines (upper
  sequence, symbols, lower sequence, one skipped line) until an upper
  sequence line whose stripped text ends with seqEnd has been processed.
  A line starting with "EXCISED" adds the marker "EX" to all three strings.

  Args:
    seqStrt: text expected at the beginning of the first upper sequence line.
    seqEnd: text expected at the end of the last upper sequence line.

  Returns:
    A tuple (upper, symbols, lower) of the concatenated strings.

  Raises:
    StopIteration: if ageAln is exhausted before the start line or the end
      line has been found.
  """
  # Skip ahead to the first upper-sequence line of the alignment.
  smLn = next(ageAln)
  while not smLn.strip().startswith(seqStrt):
    smLn = next(ageAln)

  uParts, jndParts, lParts = [], [], []
  while True:
    reachedEnd = False
    if smLn.startswith("EXCISED"):
      # The excised region itself is not printed; it is only marked.
      for parts in (uParts, jndParts, lParts):
        parts.append("EX")
    else:
      reachedEnd = smLn.strip().endswith(seqEnd)
      # Sequence lines carry coordinates on both sides: the first 10 columns
      # are dropped by the slice, the trailing coordinate by rstrip.
      uParts.append(smLn.rstrip(" 0123456789\n")[10:])
      # The symbol line (||||.||, etc.) is preceded by 10 spaces.
      jndParts.append(next(ageAln).rstrip("\n")[10:])
      lParts.append(next(ageAln).rstrip(" 0123456789\n")[10:])

    if reachedEnd:
      # Consume the two lines after the last group when they exist, so the
      # caller resumes at the same position as before. The file may end
      # right here, and that must not discard the collected alignment.
      next(ageAln, None)
      next(ageAln, None)
      break

    # Advance to the next upper-sequence (or EXCISED) line, two lines on.
    next(ageAln)
    smLn = next(ageAln)

  return ("".join(uParts), "".join(jndParts), "".join(lParts))
        
def findPrfSeq(lftLst, rhtLst):
  """Decide whether the current alignment is the best candidate seen so far.

  lftLst and rhtLst hold the two coordinates (as text) of the regions to the
  left and to the right of the excised region. The alignment is accepted
  when its breakpoints equal the module-level coL / coR (the right one
  shifted by one) and its shorter flank is longer than the module-level
  maxOfMinFlank, which is then updated.

  Returns a 7-tuple: the three alignment strings from processPrfSeq (or
  three None values when the alignment is not accepted), followed by the
  start, the end and the left and right breakpoint coordinates as ints.
  """
  global maxOfMinFlank

  alnStart = int(lftLst[0])
  alnEnd = int(rhtLst[1])
  leftBreak = int(lftLst[1])
  rightBreak = int(rhtLst[0])
  coords = (alnStart, alnEnd, leftBreak, rightBreak)

  # Lengths of the aligned flanks on either side of the excised region.
  shorterFlank = min(leftBreak - alnStart + 1, alnEnd - rightBreak + 1)

  # Reject unless the breakpoints reported by AGE match the expected ones
  # exactly and this alignment beats the best shorter flank seen so far.
  if leftBreak != coL or rightBreak - 1 != coR or shorterFlank <= maxOfMinFlank:
    return (None, None, None) + coords

  maxOfMinFlank = shorterFlank
  upper, symbols, lower = processPrfSeq(lftLst[0], rhtLst[1])
  return (upper, symbols, lower) + coords


#The file handle refers to the file with the list of 
#AGE aligned files.
# sys.argv[1] is the file listing the AGE aligned files, one name per line.
# sys.argv[2] is the output file; it is opened for appending and receives all
# the sequences (from AGE alignment) that have perfectly aligned breakpoints.
# All handles are managed by "with" so they are closed even when a listed
# file is missing or malformed.
# NOTE: prfSeqFile, ageAln, coL, coR and maxOfMinFlank are read as module
# globals by the functions above, so these names must stay as they are.
with open(sys.argv[1]) as ageAlnFlLst, open(sys.argv[2], "a") as prfSeqFile:
  for fle in ageAlnFlLst:
    fle = fle.rstrip("\n")
    if not fle.strip():
      continue  # tolerate blank lines in the list file

    # The given breakpoint coordinates are extracted from the file name
    # itself: the third dot-separated field is the chromosome, the fourth
    # holds the two coordinates separated by "_".
    flds = fle.split(".")
    chrom = flds[2]
    flds1 = flds[3].split("_")
    coL = int(flds1[0])
    coR = int(flds1[1])

    bestSeqData = None
    # Largest value seen so far of the smaller of the two flanks (the flanks
    # are to the left and right of the excised region).
    maxOfMinFlank = 0
    molFrag = ""

    with open("MOLAGE_FILES/" + fle) as ageAln:
      try:
        for ageLn in ageAln:
          if ageLn.startswith("First"):
            # The line after the "First" line ends with the fragment name.
            molFrag = next(ageAln).split()[-1]
          elif ageLn.startswith("Alignment:"):
            bLn = next(ageAln)
            if bLn.find("EXCISED") != -1:
              # First and last bracketed coordinate pairs on the line.
              lftOfExcsd = bLn[bLn.index("[") + 1:bLn.index("]")].split(",")
              rhtOfExcsd = bLn[bLn.rindex("[") + 1:bLn.rindex("]")].split(",")
              currData = findPrfSeq(lftOfExcsd, rhtOfExcsd)
              # Index 1 is the symbol string; None means "not accepted".
              if currData[1] is not None:
                bestSeqData = (molFrag,) + currData
      except StopIteration:
        # One of the explicit next() calls ran off the end of the file.
        pass

    if bestSeqData is not None:
      prntBestSeq(*(bestSeqData + (chrom,)))
