#!/bin/env python

#This script selects heterozygous SNPs from the intersection or complement VCF files created using, for example, 
#vcf-isec. 
#The script takes three arguments. The first is the intersection or complement file. The second is the 
#regions.txt file. The third is the window size. We found that a value of 10000 bp is sufficient to include
#almost all the flanking SNPs on TruSeq fragments. 
#The resulting count arrays are written out as numpy (.npy) files.

from __future__ import print_function
import sys
import numpy as np

# Region lines alternate between the two sides; the first line is a left region.
LEFT = 0
RIGHT = 1


def parse_regions(lines):
    """Parse region lines into a list of (chrom, start, end, side) tuples.

    Each non-blank line has the form ``chrom:start-end``. Sides alternate
    line by line, starting with LEFT. Blank lines are skipped and do not
    advance the left/right alternation.
    """
    regions = []
    side = LEFT
    for line in lines:
        if not line.strip():
            continue
        flds = line.split(":")
        rng = flds[1].split("-")
        # int() tolerates the trailing newline left on the end coordinate.
        regions.append((flds[0], int(rng[0]), int(rng[1]), side))
        side = RIGHT if side == LEFT else LEFT
    return regions


def count_sequencing_events(regions, win):
    """Return an array of length 2*win counting how many regions cover each offset.

    Left regions fill the first half of the array, growing leftwards from
    index win-1; right regions fill the second half, growing rightwards from
    index win. A region longer than the window is truncated to the window.
    """
    counts = np.zeros(2 * win)
    for _chrom, ll, rl, side in regions:
        frg_len = rl - ll + 1
        # Clamp so that an inverted region (end < start) contributes nothing
        # instead of producing a negative slice bound.
        span = max(0, min(frg_len, win))
        if side == LEFT:
            counts[win - span:win] += 1
        else:
            counts[win:win + span] += 1
    return counts


def is_heterozygous(genotype):
    """Return True when the first two alleles of a VCF GT string differ.

    Alleles are split on the '/' or '|' separator rather than read by
    character position, so multi-digit allele indexes are compared correctly.
    A genotype with fewer than two alleles, or with a missing ('.') allele
    among the first two, is treated as not heterozygous.
    """
    alleles = genotype.replace("|", "/").split("/")
    if len(alleles) < 2:
        return False
    first, second = alleles[0], alleles[1]
    if first == "." or second == ".":
        return False
    return first != second


def window_index(pos, ll, rl, side, win):
    """Map a position inside region [ll, rl] to an index of the 2*win arrays.

    Returns None when the position is further than the window size from the
    region edge the window is anchored on: the end (rl) for a left region,
    the start (ll) for a right region.
    """
    if side == LEFT:
        if rl + 1 - pos <= win:
            return win + pos - rl - 1  # lands in [0, win-1]
        return None
    if pos - ll < win:
        return win + pos - ll  # lands in [win, 2*win-1]
    return None


def tally_snps(vcf_lines, regions, win):
    """Count heterozygous single-base SNPs per window offset.

    vcf_lines is any iterable of VCF text lines; lines starting with '#' and
    blank lines are ignored. Each record is assigned to the first region (in
    regions-file order) on the same chromosome that contains its position.
    Records that fall in no region are reported on stdout.

    Returns (snp_counts, snp_total): the array of length 2*win and the number
    of SNPs counted.
    """
    # Group once by chromosome so each record only scans candidate regions;
    # list order within a chromosome preserves the original first-match rule.
    by_chrom = {}
    for chrom, ll, rl, side in regions:
        by_chrom.setdefault(chrom, []).append((ll, rl, side))

    snp_counts = np.zeros(2 * win)
    snp_total = 0
    for vcfln in vcf_lines:
        if vcfln.startswith("#"):
            continue
        vcf_flds = vcfln.split()
        if not vcf_flds:
            continue
        vcf_chr = vcf_flds[0]
        vcf_pos = int(vcf_flds[1])

        for ll, rl, side in by_chrom.get(vcf_chr, ()):
            if ll <= vcf_pos <= rl:
                break  # first matching region wins
        else:
            print("{0} {1} does not fit in any region.".format(vcf_chr, vcf_pos))
            continue

        # Column 10 is the first sample column; GT is its first ':' subfield.
        genotype = vcf_flds[9].split(":")[0]
        if not is_heterozygous(genotype):
            continue
        # Only single-base REF and single-base ALT count as a plain SNP.
        if len(vcf_flds[3]) != 1 or len(vcf_flds[4]) != 1:
            continue
        idx = window_index(vcf_pos, ll, rl, side, win)
        if idx is None:
            continue
        snp_counts[idx] += 1
        snp_total += 1
    return snp_counts, snp_total


def main(argv):
    """Run the script: argv[1] = VCF file, argv[2] = regions file, argv[3] = window size."""
    if len(argv) < 4:
        sys.exit("usage: {0} VCF_FILE REGIONS_FILE WINDOW_SIZE".format(
            argv[0] if argv else "script"))
    win = int(argv[3])

    with open(argv[2]) as regions_file:
        regions = parse_regions(regions_file)
    seq_event_counts = count_sequencing_events(regions, win)

    #the complement or intersection VCF file.
    with open(argv[1]) as vcf_file:
        snp_counts, snp_total = tally_snps(vcf_file, regions, win)

    print("snpCount is {0}".format(snp_total))

    #saving the arrays so the plot script can load them up
    #Check if such files already exist and delete or mv them if necessary
    np.save('snpCntVCFArr', snp_counts)
    np.save('seqEvntCntVCFArr', seq_event_counts)


if __name__ == "__main__":
    main(sys.argv)
