#!/bin/env python

#This script writes out all the SNPs from the TruSeq fragments selected by the script snpProcAGEscrpt.
#Only heterozygous deletion events are considered.
#In this script three command line arguments are given.
#The first argument is a file containing the data. This input file contains all the sequences (from AGE alignment) that have perfectly aligned breakpoints. It is usually named as prfSeqs.txt. 
#The second argument is a text file that contains genotype information to determine if the deletions are heterozygous. 
#This information comes from cnvnator. But if we already know the gt information (i.e., heterozygosity) w/o using cnvnator, 
#we can just pass 0 for the argv[2] (second argument). Then this script will not look for an extra file.
#The third argument is the output VCF file. 
#At the end, the script prints to stdout the number of events deemed heterozygous. For our current set, this number is 262.

from __future__ import print_function
import sys
import numpy as np

#All the snps on a fragment are processed. There is no window restriction in this script. 
def prntSnpsVCF(seqLst, offSt, prvCnt, chr, strt, flnkLen, exCoR, lfl, rfl):
  """Write one VCF record for every '.' found in the symbol row of a flank.

  seqLst  -- three parallel strings: [0] supplies the REF base, [1] is the
             symbol row in which '.' marks a SNP, [2] supplies the ALT base.
  offSt   -- offset added to every index found in seqLst[1].
  prvCnt  -- number of '-' characters already counted before seqLst[0].
  chr     -- chromosome name; a single leading "chr" is dropped for CHROM.
  strt    -- fragment start coordinate (used only when exCoR == 0).
  flnkLen -- flank length (used only when exCoR == 0).
  exCoR   -- 0 selects the left-flank arithmetic (positions counted backwards
             from strt+flnkLen, RP negative); any other value is used as the
             base coordinate and positions are counted forwards from it.
  lfl,rfl -- written unchanged as the LFL and RFL INFO values.

  Reads the module-level names molfrag, gtFlg and vcfFle, which must be set
  before the call. Returns None.

  The scan is iterative, so the number of SNPs on a flank is not limited by
  the interpreter's recursion depth.
  """
  upper, marks, lower = seqLst[0], seqLst[1], seqLst[2]
  # str.strip("chr") would remove any of the letters c/h/r from BOTH ends;
  # only the literal prefix must go.
  chrom = chr[3:] if chr.startswith("chr") else chr

  gaps = prvCnt #running count of '-' in the REF row, excluding SNP columns
  pos = 0       #first column not yet examined
  x = marks.find(".")
  while x != -1:
    gaps += upper.count("-", pos, x)
    y = x+offSt
    if exCoR == 0:
      rel = -(y+1)
      snpCrd = strt+flnkLen-(y+1-gaps)
    else:
      rel = y
      snpCrd = exCoR +(y-gaps)

    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\tMF={7};RP={8};LFL={9};RFL={10};{11}".format(chrom, snpCrd, ".", upper[x], lower[x], ".", ".", molfrag.strip("'"), rel, lfl, rfl, gtFlg), file=vcfFle)
    pos = x+1
    x = marks.find(".", pos)

def procSnpsVCF(uStr, symStr, lStr, chr, strt, exCoL, exCoR, ends):
  """Split the three alignment rows at the "EX" marker and emit SNPs per flank.

  uStr, symStr, lStr -- the upper sequence row, the symbol row and the lower
                        sequence row; each is cut at its first "EX".
  chr, strt          -- passed through to prntSnpsVCF.
  exCoL, exCoR, ends -- used to derive the flank lengths handed to
                        prntSnpsVCF (exCoL-strt+1 and ends-exCoR+1).

  The left-hand pieces are reversed so that index 0 is the column next to the
  marker, and are processed with exCoR passed as 0, which selects the
  left-flank arithmetic in prntSnpsVCF. Returns None.
  """
  rows = (uStr, symStr, lStr)
  leftParts = [row.partition("EX")[0] for row in rows]
  rightParts = [row.partition("EX")[2] for row in rows]

  #Lengths of the symbol-row pieces as they stand, i.e. including any '-'.
  #NOTE(review): when symStr comes straight from file iteration its right
  #piece still ends with the line terminator, so rightLen counts it too --
  #confirm whether that is intended for RFL.
  leftLen = len(leftParts[1])
  rightLen = len(rightParts[1])

  mirroredLeft = [piece[::-1] for piece in leftParts]

  prntSnpsVCF(mirroredLeft, 0, 0, chr, strt, exCoL-strt+1, 0, leftLen, rightLen)
  prntSnpsVCF(rightParts, 0, 0, chr, strt, ends-exCoR+1, exCoR, leftLen, rightLen)


def xtract(lnInfo1, lnInfo2):
  """Parse the two info lines of a record and decide its genotype flag.

  lnInfo1 -- whitespace-separated line; the first token is the fragment name
             and the last token is the integer start coordinate.
  lnInfo2 -- whitespace-separated line; token 1 is the end coordinate
             (trailing ','), token 2 the chromosome (trailing ','), and
             tokens 5 and 6 the breakpoints written as "(exCoL," "exCoR)".

  Returns (fragNm, chrom, strt, exCoL, exCoR, ends, gtFlg), where gtFlg is
  "HET" or "DC" (don't care).

  If sys.argv[2] is "0", no genotype file is consulted and gtFlg is "HET".
  Otherwise sys.argv[2] names a genotype file. The first line containing
  "chrom:exCoL-(exCoR-1)" and the two lines after it each supply a number in
  their second-to-last column. (Presumably the first is the sample and the
  other two are the parents -- verify against the file producer.)

  Raises ValueError if the region is not present in the genotype file or if
  fewer than two lines follow the matching line. The end-of-file case is
  reported explicitly so that a StopIteration cannot escape to a caller that
  uses StopIteration to detect the end of its own input.
  """
  flds1 = lnInfo1.split()
  fragNm = flds1[0]
  strt = int(flds1[-1])
  flds2 = lnInfo2.split()
  chrom = flds2[2].rstrip(",")
  exCoL = int(flds2[5].strip("(,"))
  exCoR = int(flds2[6].strip(")"))
  ends = int(flds2[1].strip(","))

  if sys.argv[2] == "0": #heterozygosity already known; no genotype file needed.
    return (fragNm, chrom, strt, exCoL, exCoR, ends, "HET")

  gtStr = chrom+":"+str(exCoL)+"-"+str(exCoR-1)
  gtNums = None
  with open(sys.argv[2]) as GTfle:
    for line in GTfle:
      #NOTE(review): substring match; a region that is a textual prefix of
      #another (e.g. ...-200 vs ...-2000) would also match -- confirm the
      #genotype file cannot contain such pairs.
      if gtStr in line:
        recs = (line, next(GTfle, None), next(GTfle, None))
        if None in recs:
          raise ValueError("genotype file {0} ends before the three lines for {1} are complete".format(sys.argv[2], gtStr))
        gtNums = [float(rec.split()[-2]) for rec in recs]
        break
  if gtNums is None:
    raise ValueError("region {0} not found in genotype file {1}".format(gtStr, sys.argv[2]))

  (gtNum1, gtNum2, gtNum3) = gtNums
  #After the first condition, the next two "ands" check that a HET type deletion can happen from the parents.
  if 0.5 <= gtNum1 <= 1.5 and (0.5 <= gtNum2 or 0.5 <= gtNum3) and (gtNum2 <= 1.5 or gtNum3 <= 1.5):
    gtFlg = "HET"
  else:
    gtFlg = "DC" #DC for Don't Care

  return (fragNm, chrom, strt, exCoL, exCoR, ends, gtFlg)


#The first argument is a file containing the data. This input file contains all the sequences (from AGE alignment) that have perfectly aligned breakpoints.
#Specify this file as say prfSeqs or some other name.
#Driver: read the sequences file (argv[1]) five lines at a time -- begin-info,
#upper sequence, symbol row, lower sequence, end-info -- and write the SNPs of
#every record flagged "HET" to the VCF file (argv[3]).
#molfrag, gtFlg and vcfFle are deliberately module-level names: prntSnpsVCF
#reads them as globals.
#NOTE(review): the VCF is opened in append mode, so running the script against
#an existing file adds a second header block after the old content -- confirm
#that this is intended.
hetCnt = 0
with open(sys.argv[1]) as fhSeqs:
  with open(sys.argv[3], "a") as vcfFle:
    print("##fileformat=VCFv4.1", file=vcfFle)
    print("##INFO=<ID=MF,Number=1,Type=String,Description=\"Moleculo fragment\">", file=vcfFle)
    print("##INFO=<ID=RP,Number=1,Type=Integer,Description=\"Relative position\">", file=vcfFle)
    print("##INFO=<ID=LFL,Number=1,Type=Integer,Description=\"Left fragment length\">", file=vcfFle)
    print("##INFO=<ID=RFL,Number=1,Type=Integer,Description=\"Right fragment length\">", file=vcfFle)
    print("##INFO=<ID=HET,Number=0,Type=Flag,Description=\"Heterozygous deletion\">", file=vcfFle)
    print("##INFO=<ID=HOM,Number=0,Type=Flag,Description=\"Homozygous deletion\">", file=vcfFle)
    print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", file=vcfFle)

    while True:
      #Only the reads are guarded: a StopIteration here means the input is
      #exhausted (an incomplete trailing record is dropped). Errors raised
      #while processing a record must not be mistaken for end of input.
      try:
        bInfo = next(fhSeqs) #b for begin
        uSeq = next(fhSeqs)
        sym = next(fhSeqs)
        lSeq = next(fhSeqs)
        enInfo = next(fhSeqs) #en for end
      except StopIteration:
        break #out of while loop

      (molfrag, chr, strtCrd, exCoL, exCoR, ends, gtFlg) = xtract(bInfo, enInfo)
      if gtFlg == "HET": #comment this line, if all SNPs should be printed in VCF file.
        hetCnt += 1
        procSnpsVCF(uSeq, sym, lSeq, chr, strtCrd, exCoL, exCoR, ends)

print("het count ", hetCnt)
