#!/bin/env python

from __future__ import print_function
import sys
import os
import gzip

#This script phases the GATK call set variants. This particular script is for NA12878.

# Header written to the phased output file. The sample columns are, in order,
# the sample being phased followed by the two samples it is phased against.
VCF_HEADER_LINES = (
    "##fileformat=VCFv4.1",
    '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878\tNA12891\tNA12892",
)

# Classification labels returned by phase_genotype().
PHASED = "phased"
AMBIGUOUS = "ambiguous"        # both orientations are possible -> cannot be phased
INCONSISTENT = "inconsistent"  # no orientation is possible given the other two GTs

# Zero-based VCF column indexes of the three sample fields.
_SAMPLE_COLUMNS = (9, 10, 11)


def _alleles(genotype):
    """Split a GT string such as "0/1" or "1|2" into its allele tokens.

    Splitting on the separator (rather than indexing characters 0 and 2)
    keeps multi-digit allele indexes such as "10/11" intact.
    """
    return genotype.replace("|", "/").split("/")


def phase_genotype(child_gt, parent1_gt, parent2_gt):
    """Classify one genotype of the phased sample against two other genotypes.

    Args:
        child_gt: GT string of the sample being phased (first sample column).
        parent1_gt: GT string of the second sample column.
        parent2_gt: GT string of the third sample column.

    Returns:
        None when ``child_gt`` is not a diploid heterozygous call (nothing is
        written for such sites), otherwise a ``(label, genotype)`` tuple:

        * ``(PHASED, "x|y")`` - exactly one orientation is possible; ``x`` is
          the allele found in ``parent1_gt`` and ``y`` the one found in
          ``parent2_gt``.
        * ``(AMBIGUOUS, child_gt)`` - both alleles occur in both other
          genotypes, so either orientation is possible.
        * ``(INCONSISTENT, child_gt)`` - no orientation is possible.
    """
    child = _alleles(child_gt)
    if len(child) != 2 or child[0] == child[1]:
        # Homozygous, haploid or otherwise non-diploid call: nothing to phase.
        return None

    first, second = child
    in_parent1 = set(_alleles(parent1_gt))
    in_parent2 = set(_alleles(parent2_gt))

    # The two possible ways to hand one allele to each of the other samples.
    first_from_parent1 = first in in_parent1 and second in in_parent2
    second_from_parent1 = second in in_parent1 and first in in_parent2

    if first_from_parent1 and second_from_parent1:
        return AMBIGUOUS, child_gt
    if first_from_parent1:
        return PHASED, first + "|" + second
    if second_from_parent1:
        return PHASED, second + "|" + first
    return INCONSISTENT, child_gt


def _format_row(fields, child_gt, parent1_gt, parent2_gt):
    """Build one tab-separated output row; ID, QUAL and INFO are blanked to "."."""
    return "\t".join((
        fields[0], fields[1], ".", fields[3], fields[4], ".", fields[6], ".",
        "GT", child_gt, parent1_gt, parent2_gt,
    ))


def main(argv):
    """Phase the first sample of a gzipped three-sample VCF.

    Args:
        argv: ``[prog, in.vcf.gz, phased_out, unphased_out, inconsistent_out]``.
            Extra trailing arguments are ignored.

    Returns:
        Process exit status: 0 on success, 2 when arguments are missing.

    Raises:
        ValueError: if a data line has fewer than 12 whitespace-separated
            columns.
    """
    if len(argv) < 5:
        print(
            "usage: {0} <in.vcf.gz> <phased.vcf> <unphased.txt> "
            "<inconsistent.txt>".format(argv[0] if argv else "phase"),
            file=sys.stderr,
        )
        return 2

    vcf_path, phased_path, unphased_path, inconsistent_path = argv[1:5]

    # NOTE(review): outputs are opened in append mode as before, so re-running
    # against existing files appends a second header to the phased file -
    # confirm whether truncating ("w") is what is actually wanted.
    with gzip.open(vcf_path, "rb") as vcf, \
            open(phased_path, "a") as phased_out, \
            open(unphased_path, "a") as unphased_out, \
            open(inconsistent_path, "a") as inconsistent_out:

        for header_line in VCF_HEADER_LINES:
            print(header_line, file=phased_out)

        sinks = {
            PHASED: phased_out,
            AMBIGUOUS: unphased_out,
            INCONSISTENT: inconsistent_out,
        }

        for line_no, raw in enumerate(vcf, 1):
            # gzip yields bytes on Python 3 and str on Python 2; normalise to
            # the native text type so str comparisons and formatting work.
            line = raw if isinstance(raw, str) else raw.decode("utf-8")
            if line.startswith("#") or not line.strip():
                continue

            fields = line.split()
            if len(fields) <= _SAMPLE_COLUMNS[-1]:
                raise ValueError(
                    "{0}: line {1} has {2} columns, expected at least 12".format(
                        vcf_path, line_no, len(fields)))

            # Only the leading GT sub-field of each sample column is used.
            child_gt, parent1_gt, parent2_gt = (
                fields[col].split(":")[0] for col in _SAMPLE_COLUMNS)

            outcome = phase_genotype(child_gt, parent1_gt, parent2_gt)
            if outcome is None:
                continue
            label, reported_gt = outcome
            print(_format_row(fields, reported_gt, parent1_gt, parent2_gt),
                  file=sinks[label])

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
