#!/bin/env python

#This script counts the variants from a tabix-indexed VCF file that fall inside a set of regions
#and saves the per-position variant counts in a numpy array.
#The first argument is the regions file. The second argument is the in-phase or out-of-phase VCF file.
#The third argument is the WIN size. This should be the same WIN size that was used to create the
#regions. The output array will have inPhs or outPhs as part of its name.

from __future__ import print_function

import os
import subprocess
import sys

import numpy as np

def array_name(vcf_name):
    """Return the base name of the output array for the VCF file *vcf_name*.

    Everything in front of the "inPhs" / "outPhs" marker is kept as the sample
    prefix.  The marker itself is searched for (not just "in" / "out"), so a
    sample or directory name that happens to contain "in" or "out" is no
    longer truncated.  A name carrying neither marker is treated as
    out-of-phase and split at the first "out", as before.
    """
    if "inPhs" in vcf_name:
        return vcf_name.partition("inPhs")[0] + "inPhsGATKsnpCnt"
    if "outPhs" in vcf_name:
        return vcf_name.partition("outPhs")[0] + "outPhsGATKsnpCnt"
    return vcf_name.split("out")[0] + "outPhsGATKsnpCnt"


def window_index(pos, left, right, win, is_right):
    """Map a variant position to its bin in the 2*win count array.

    Left regions fill bins [0, win) and are aligned so that position *right*
    lands on bin win-1; right regions fill bins [win, 2*win) and are aligned
    so that position *left* lands on bin win.

    Raises IndexError when the bin falls outside the array, which happens
    when the region is wider than *win*.  Without this check a negative bin
    of a left region would silently wrap around into the right half.
    """
    if is_right:
        idx = win + pos - left
    else:
        idx = win + pos - right - 1
    if not 0 <= idx < 2 * win:
        raise IndexError(
            "position {0} of region {1}-{2} maps to bin {3}, outside the "
            "{4}-bin array; is the region wider than WIN?".format(
                pos, left, right, idx, 2 * win))
    return idx


def tabix_records(vcf_path, region):
    """Return the lines tabix prints for *region* of *vcf_path*.

    tabix is run without a shell and its output is read from a pipe, so file
    names containing spaces work and concurrent runs in the same directory do
    not overwrite each other's temporary file.  A non-zero tabix exit status
    raises subprocess.CalledProcessError instead of yielding an empty result.
    """
    out = subprocess.check_output(["tabix", vcf_path, region],
                                  universal_newlines=True)
    return out.split("\n")


def count_variants(regions_path, vcf_path, win, fetch=tabix_records):
    """Count the variants of *vcf_path* per position inside the regions.

    *regions_path* names a file whose first whitespace-separated column holds
    regions written as "chrom:start-end".  Regions are taken to alternate
    left, right, left, ... starting with a left one.  *fetch* is called as
    fetch(vcf_path, region) and must return the VCF record lines of a region.

    Returns (counts, snp_count): the float array of length 2*win and the
    total number of variants counted.
    """
    counts = np.zeros(2 * win)
    seen = set()  # (chrom, pos) of every record met so far; set gives O(1) lookups
    snp_count = 0
    is_right = False  # the first region is a left one
    with open(regions_path) as regions:
        for line in regions:
            flds = line.split()
            if not flds:
                continue  # blank line: nothing to query, keep the left/right phase
            region = flds[0]
            bounds = region.split(":")[1].split("-")
            left = int(bounds[0])
            right = int(bounds[1])
            for record in fetch(vcf_path, region):
                vcf_flds = record.split()
                if not vcf_flds or vcf_flds[0].startswith("#"):
                    continue
                pos = int(vcf_flds[1])
                key = (vcf_flds[0], vcf_flds[1])
                # A coordinate is remembered the first time it is met, whether
                # or not it lies inside the region, and never counted again.
                if key in seen:
                    continue
                seen.add(key)
                if left <= pos <= right:
                    snp_count += 1
                    counts[window_index(pos, left, right, win, is_right)] += 1
            is_right = not is_right
    return counts, snp_count


def main(argv):
    """Run the script: argv is [program, regions file, VCF file, WIN]."""
    if len(argv) < 4:
        sys.exit("usage: {0} <regions file> <VCF file> <WIN>".format(
            os.path.basename(argv[0]) if argv else "script"))
    vcf_path = argv[2]
    win = int(argv[3])
    counts, snp_count = count_variants(argv[1], vcf_path, win)
    np.save(array_name(vcf_path), counts)  # np.save appends ".npy"
    print("snpCount is {0}".format(snp_count))


if __name__ == "__main__":
    main(sys.argv)
