#!/usr/bin/perl

###############################
#
# Copyright Stanford University 2016
# Author: John Bell
#
# This is a script to simplify the genotype information of a 
# single-sample 10X-sequenced vcf file to GT:BX:PS
# (genotype -- barcodes -- phase id)
# 
# It does not print any header information.  It does print all data lines.
#
# NB:  WE REQUIRE NUMERICAL CHROMOSOMES, NCBI STYLE + X=23, Y=24, FOR SORTING
#
# Input:  a 10X-style phased vcf file, that includes GT, BX 
# and PS fields in the FORMAT field (among others possibly)
# Output:  a simplified vcf file, with only GT:BX:PS genotype fields
#
################################

use strict;
use warnings;

my $usage = "usage: $0 <phased vcf file>  >>creates simplified vcf file, with only GT:BX:PS genotype field -- use only on phased data";

# Show the usage text on request rather than trying to open "-h" as a file.
if (@ARGV && $ARGV[0] =~ /\A(?:-h|--help)\z/) {
    print STDERR "$usage\n";
    exit 0;
}

# simplify_genotype($format, $sample)
#
# Reduce one sample column to its "GT:BX:PS" form.
#   $format -- the colon-separated FORMAT column (e.g. "GT:AD:BX:PS")
#   $sample -- the matching colon-separated sample column
# Returns the string "<GT>:<BX>:<PS>".  A sub-field that is absent from
# FORMAT, or that was dropped from the end of the sample column, is emitted
# as an empty string.  Returns undef when GT is not the first FORMAT key.
sub simplify_genotype {
    my ($format, $sample) = @_;

    my @keys   = split /:/, $format;
    my @values = split /:/, $sample;
    return unless @keys && $keys[0] eq 'GT';

    my ($bx, $ps) = ('', '');

    # Walk by index rather than by truthiness of the key, so an empty or "0"
    # key does not end the scan before BX/PS have been seen.
    for my $i (1 .. $#keys) {
        # The sample column may carry fewer sub-fields than FORMAT names.
        my $value = defined $values[$i] ? $values[$i] : '';
        if    ($keys[$i] eq 'BX') { $bx = $value; }
        elsif ($keys[$i] eq 'PS') { $ps = $value; }
        last if $bx ne '' && $ps ne '';
    }

    my $gt = defined $values[0] ? $values[0] : '';
    return join ':', $gt, $bx, $ps;
}

# Main loop: header lines are dropped, every data line is printed with its
# genotype column reduced to GT:BX:PS.
while (my $line = <>) {

    next if $line =~ /\A#/;

    # $. is the 1-based input line number, counting header lines as well.
    my $line_no = $.;
    my @fields  = split ' ', $line;

    # CHROM..FORMAT plus one sample column are needed for everything below.
    if (@fields < 10) {
        die "line $line_no has fewer than 10 columns -- need CHROM through FORMAT plus one sample\n";
    }

    # we require numerical chromosomes to simplify testing
    if ($fields[0] =~ /\A\D/) {
        die "need numerical chromosomes!!! in position $line_no  $fields[0] $fields[1]\n";
    }

    # already in simplified format, so print the line untouched
    if ($fields[8] eq "GT:BX:PS") {
        print $line;
        next;
    }

    my $genotype = simplify_genotype($fields[8], $fields[9]);
    unless (defined $genotype) {
        die "GT not first field!!! in position $line_no  $fields[0] $fields[1]\n";
    }

    print join("\t", @fields[0 .. 7], "GT:BX:PS", $genotype), "\n";
}
