#!/bin/env python

from __future__ import print_function
import sys
import numpy as np

#This script selects heterozygous SNPs from the intersection or complement VCF files created using, for example, 
#vcf-isec. 
#The script takes four arguments. The first is the intersection or complement file. The second is the 
#regions.txt file. The third is the window size. We found that a value of 10000 bp is sufficient to include
#almost all the flanking SNPs on TruSeq fragments. 
#The fourth argument is an output VCF file. The SNPs are written out in this VCF file.

# 0-based column indices of the VCF data-line fields this script reads.
CHROM_COL = 0
POS_COL = 1
REF_COL = 3
ALT_COL = 4
SAMPLE_COL = 9  # presumably the first sample column (standard VCF layout)


def parse_region(line):
    """Parse one ``chrom:left-right`` line of the regions file.

    Returns a ``(chrom, left, right)`` tuple with integer bounds; callers
    treat both bounds as inclusive.  Raises ``IndexError`` or ``ValueError``
    when the line does not have that shape.
    """
    flds = line.split(":")
    bounds = flds[1].split("-")
    return flds[0], int(bounds[0]), int(bounds[1])


def load_regions(path):
    """Read the regions file once and return a list of parsed regions.

    Blank lines are skipped and do not count towards the left/right
    alternation.  Position in the returned list matters: entries at even
    0-based indices are handled as "left" regions and entries at odd
    indices as "right" regions.
    """
    regions = []
    with open(path) as handle:
        for line in handle:
            if line.strip():
                regions.append(parse_region(line))
    return regions


def is_heterozygous(sample_field):
    """Return True when the genotype names two different alleles.

    ``sample_field`` is a colon-separated sample column whose first
    sub-field is the genotype (e.g. ``0/1:35:...``).  Both ``/`` and ``|``
    separators are accepted.  Calls that are not diploid or that contain a
    missing allele (``.``) are reported as not heterozygous instead of
    raising.
    """
    genotype = sample_field.split(":")[0]
    alleles = genotype.replace("|", "/").split("/")
    if len(alleles) != 2 or "." in alleles:
        return False
    return alleles[0] != alleles[1]


def in_window(pos, left, right, is_right_region, win):
    """Tell whether ``pos`` lies within ``win`` bases of a region's edge.

    For a left region the distance is measured from the region's right
    bound; for a right region it is measured from the region's left bound.
    """
    if is_right_region:
        return pos - left < win
    return right + 1 - pos <= win


def classify_variant(fields, regions, win):
    """Classify one VCF record given as a list of whitespace-split fields.

    Returns ``(placed, selected)``.  ``placed`` is True when the record
    falls inside some region; only the first such region is considered.
    ``selected`` is True when, in addition, the sample is heterozygous,
    REF and ALT are both a single character, and the position is inside
    the window of that region.
    """
    chrom = fields[CHROM_COL]
    pos = int(fields[POS_COL])
    for idx, (region_chrom, left, right) in enumerate(regions):
        if region_chrom != chrom or not left <= pos <= right:
            continue
        selected = (
            is_heterozygous(fields[SAMPLE_COL])
            and len(fields[REF_COL]) == 1
            and in_window(pos, left, right, idx % 2 == 1, win)
            and len(fields[ALT_COL]) == 1  # multi-character ALT is not tracked
        )
        return True, selected
    return False, False


def filter_vcf_lines(vcf_lines, regions, win):
    """Yield ``(line, is_snp)`` pairs for every line that must be written.

    Leading lines that start with ``#`` are passed through with
    ``is_snp=False``.  Selected records are yielded with ``is_snp=True``.
    Blank lines are ignored.  A record that falls in no region is reported
    on standard output and not yielded.  An empty or header-only input
    simply ends the iteration.
    """
    in_header = True
    for line in vcf_lines:
        if in_header and line.startswith("#"):
            yield line, False
            continue
        in_header = False
        fields = line.split()
        if not fields:
            continue
        placed, selected = classify_variant(fields, regions, win)
        if not placed:
            print("{0} {1} does not fit in any region.".format(
                fields[CHROM_COL], int(fields[POS_COL])))
        elif selected:
            yield line, True


def main(argv=None):
    """Run the selection.

    ``argv`` defaults to ``sys.argv`` and must hold, after the program
    name: the intersection/complement VCF, the regions file, the window
    size and the output VCF path.  Returns the number of SNPs written.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 5:
        sys.exit("usage: {0} <isec.vcf> <regions.txt> <window> <out.vcf>"
                 .format(args[0] if args else "script"))

    win = int(args[3])
    regions = load_regions(args[2])
    snp_count = 0
    # The output is opened in append mode, as before, so an existing file is
    # extended rather than overwritten.
    with open(args[1]) as vcf_in, open(args[4], "a") as vcf_out:
        for line, is_snp in filter_vcf_lines(vcf_in, regions, win):
            vcf_out.write(line)
            if is_snp:
                snp_count += 1

    print("snpCount is {0}".format(snp_count))
    return snp_count


if __name__ == "__main__":
    main()
