#!/usr/bin/perl -w
#########################################################
# Author:	Nancy Hansen
# Date:		4/1/2009
# Function:	MPG "most probable genotype"
#               Program to predict the most likely genotype
#               at multiple positions using Bayesian
#               methods on aligned short read data.
#               bam2mpg gives mpg results from a BAM
#               file.
##########################################################

use strict;
use Getopt::Long;
use Carp;

use Bio::SeqIO;

use GTB::Var::SiteProfile;
use GTB::Var::Polymorphism;

use vars qw($VERSION);

$| = 1; # let's see output right away!

my $Id = q$Id: bam2mpg 3561 2010-03-26 17:05:05Z nhansen $;
$VERSION = sprintf "%.4f", substr(q$Rev: 3561 $, 4)/10000;

my $samtools_exe = '/usr/local/gtb/vendor/bin/samtools';

if (!-x $samtools_exe)
{
    croak "Samtools executable $samtools_exe is not executable!\n";
}

my $opt_coverage; # option to write the coverage to a final field --disabled 1/12/10, will remove soon
my $min_ds_coverage = 0; # minimum number of times an allele must be seen on each strand to be included
my $align_bias = 0; # option to tell program that reference bases are more likely to align than non-ref
                    # if 55% of bases aligning from a heterozygote are reference, then align bias is .05
my $qual_filter = 0; # if set to non-zero, will exclude bases with lower than the specified quality
my $qual_offset = 33; 
my $opt_pooling; # option to look for non-ref alleles in pools
my $verbose = 0; # print out a lot!
my $no_rands = 200; # use a maximum of this many reads' coverage (after quality and strand filtering)

my $prob_error = 0.02;
my $min_diff = 3; # this corresponds to the prior probability of observing a non-ref genotype.
my $indels_only; # option to skip SNP calling and call only indels
my $no_indels; # option to skip indel calling, saving memory for storing read coordinates.
my $score_variant; # option to score a variant by comparing to homozyg. ref. rather than next most probable genotype
my $region; # option to specify only a particular region in the BAM file (in format "1:1000-2000")
my $only_nonref; # option to report only differences from reference
my $single_copy; # option to specify a region in the BAM file that is single-copy (e.g., the X or Y chromosome)
my $nocache; # option to disallow caching of genotypes for unique counts of each base (regardless of quality)

my $Usage = "Usage: bam2mpg <--qual_filter quality> <--coverage> <--only_nonref> <--align_bias bias> <--ds_coverage # of bases required on each strand> <--indels> <--no_indels> <--nocache> <--region chr1:1000-2000> <--single_copy chr:start-end> <reference fasta> <bam file>\n";

GetOptions("coverage" => \$opt_coverage, "only_nonref" => \$only_nonref, "align_bias=f" => \$align_bias, "ds_coverage=i" => \$min_ds_coverage, "qual_filter=i" => \$qual_filter, "verbose" => \$verbose, "indels" => \$indels_only, "pooling" => \$opt_pooling, "region=s" => \$region, "no_indels" => \$no_indels, "score_variant" => \$score_variant, "single_copy=s" => \$single_copy, "nocache" => \$nocache);

($#ARGV == 1)
  or die "$Usage";

my $ref_fasta = $ARGV[0];
my $bam_file = $ARGV[1];
my %covHash; # for storing previously-calculated MPG results

my $rh_ref_seqs = read_fasta($ref_fasta);
my $rh_ref_lengths = {};
foreach my $ref (keys %{$rh_ref_seqs}) {
    $rh_ref_lengths->{$ref} = length $rh_ref_seqs->{$ref};
}

my $rh_sc_regions = parse_single_copy_regions($single_copy, $rh_ref_lengths) if ($single_copy);

print "Assuming uniform error rate of $prob_error and prior log diff of $min_diff.\n";
print "Filtering bases that are less than quality $qual_filter.\n" if ($qual_filter);
print "Filtering bases that don\'t have minimum coverage on each strand of $min_ds_coverage.\n" if ($min_ds_coverage);
print "Assuming an alignment bias of $align_bias.\n" if ($align_bias);
print "Single-copy regions: $single_copy.\n" if ($single_copy);

# read in coverage in pileup format:

my $pileup_pipe = ($region) ? "$samtools_exe view -b $bam_file $region | $samtools_exe pileup -f $ref_fasta - | " : "$samtools_exe pileup -f $ref_fasta $bam_file |";

print "Calling \'$pileup_pipe\'\n" if ($verbose);

open PILEUP, $pileup_pipe
    or croak "Couldn\'t run $samtools_exe on $bam_file\n";

my $rh_read_pointers = {}; # will contain pointers to align coords of current reads
my $rh_align_coords = {}; # will contain alignment coordinates for each read (used in indel calling)
my $rh_indel_coverage = {}; # will hold coverage objects for indels

my $last_ref;
while (<PILEUP>)
{
    if (/^(\S+)\s(\d+)\s([ATGCatgcNn])\s(\d+)\s(\S+)\s(\S+)/)
    {
        my ($ref_name, $ref_pos, $ref_base, $no_bases, $cov_string, $qual_string) = ($1, $2, $3, $4, $5, $6);
        #last if ($last_ref && $last_ref ne $ref_name);

        my $sc_here = ($single_copy && exists($rh_sc_regions->{$ref_name}) &&
                        (($rh_sc_regions->{$ref_name} eq '') ||
                         (substr($rh_sc_regions->{$ref_name}, $ref_pos-1, 1)
                             eq '1'))) ? 1 : 0;
       
        $last_ref = $ref_name;
        print "$ref_name\t$ref_pos\t$ref_base\n" if ($verbose);
        my $uc_ref_base = uc $ref_base;

        my $ra_coverage = []; # will contain bases and qualities seen
        $rh_indel_coverage->{$ref_name} = {} if (!defined ($rh_indel_coverage->{$ref_name}));
        $rh_align_coords->{$ref_name} = [] if (!defined ($rh_align_coords->{$ref_name}));
        $rh_read_pointers->{$ref_name} = [] if (!defined ($rh_read_pointers->{$ref_name}));

        split_coverage($ra_coverage, $rh_read_pointers->{$ref_name}, $rh_align_coords->{$ref_name}, $rh_indel_coverage->{$ref_name}, $cov_string, $qual_string, $ref_pos, \$rh_ref_seqs->{$ref_name}, $qual_offset);

        next if ($indels_only);

        if ($qual_filter) # remove bases with low quality:
        {
            filter_for_qual($ra_coverage, $qual_filter);
        }

        filter_for_ds_coverage($ra_coverage, $min_ds_coverage) if ($min_ds_coverage);

        foreach my $rh_base (@{$ra_coverage})
        {
            my $base = $rh_base->{'base'};
            my $qual = $rh_base->{'qual'};

            print "\t$base\t$qual\n" if ($verbose);
        }

        if ((!$opt_pooling) && ($no_bases > $no_rands)) {
            take_random_subset_of_coverage($ra_coverage, $no_rands) unless $opt_pooling;
        }

        my @ra_cs = ();
        foreach my $rh_base (@{$ra_coverage})
        {
            push @ra_cs, $rh_base->{'base'};
        }
        my $ra_cs = join("", (sort @ra_cs));
        if (!$sc_here and !$nocache and exists($covHash{$ref_base}{$ra_cs}) and !$opt_pooling) {
            #next if $covHash{$ref_base}{$ra_cs} =~ /0$/;
            print "MPG_SNV\t$ref_name\t$ref_pos\t$ref_base";
            print $covHash{$ref_base}{$ra_cs};
            print "\n";
        } else {

            my $site_profile = GTB::Var::SiteProfile->new(
                              -reference_base => $uc_ref_base,
                              -coverage => $ra_coverage,
                              -default_error => $prob_error,
                              -single_copy => $sc_here);

            my $coverage = @{$ra_coverage};
            # pooling option:
            if ($opt_pooling)
            {
                foreach my $nr_allele (@{$site_profile->minor_allele_proportion()})
                {
                    my $minor_allele = $nr_allele->{'minor_allele'};
                    my $minor_observed = $nr_allele->{'minor_observed'};
                    my $total_observed = $nr_allele->{'total_observed'};
                    my $minor_forward = $nr_allele->{'minor_forward'};
                    my $total_forward = $nr_allele->{'total_forward'};
                    my $minor_reverse = $nr_allele->{'minor_reverse'};
                    my $total_reverse = $nr_allele->{'total_reverse'};
                    my $proportion = $nr_allele->{'proportion'};
                    my $error = $nr_allele->{'error'};
                    print "MAP\t$ref_name\t$ref_pos\t$ref_base\t$minor_allele\t$minor_observed/$total_observed\t$minor_forward/$total_forward\t$minor_reverse/$total_reverse\t$proportion\t$error\n";
                }
                next;
            }
    
            my $rh_log_likelihoods = $site_profile->log_likelihoods(
                                -prior_diff => $min_diff,
                                -alignment_bias => $align_bias);
    
            my @sorted_genotypes = sort {$rh_log_likelihoods->{$b} <=>
                                    $rh_log_likelihoods->{$a}} 
                                         keys %{$rh_log_likelihoods};
    
            my $mpg = $sorted_genotypes[0];
            # compare to homozygous ref if -score_variant option was used
            my $nmpg = ($score_variant) ? $uc_ref_base.$uc_ref_base : 
                                          $sorted_genotypes[1];
            my $score = int($rh_log_likelihoods->{$mpg} - $rh_log_likelihoods->{$nmpg});
    
            if ($mpg ne $uc_ref_base.$uc_ref_base)
            {
                print "Nonref genotype!!!\n" if ($verbose);
            }
            my $nr_string = (($sc_here && $mpg eq $uc_ref_base) || 
                             ($mpg eq "$uc_ref_base$uc_ref_base")) ? 0 : 1;
            next if ($nr_string == 0 and $only_nonref);
            $nr_string .= "\t$coverage"; # kludgy!
            $covHash{$ref_base}{$ra_cs} = "\t$mpg\t$score\t$nr_string" unless ($sc_here || $nocache);
            print "MPG_SNV\t$ref_name\t$ref_pos\t$ref_base\t$mpg\t$score\t$nr_string";
            print "\n";
        }
    }
}

close PILEUP;

exit if ($opt_pooling);

unless ($no_indels)
{
    foreach my $ref (keys %{$rh_align_coords})
    {
        process_indels($ref, $rh_indel_coverage->{$ref}, $rh_align_coords->{$ref}, \$rh_ref_seqs->{$ref}, $qual_filter);
    }
}

sub read_fasta
{
    my $fasta_file = shift;

    my $rh_seqs = {};
    my $seqio = Bio::SeqIO->new(-file => $fasta_file,
                                -format => 'Fasta');
    while (my $seq = $seqio->next_seq())
    {
        my $display_id = $seq->display_id();
        $rh_seqs->{$display_id} = $seq->seq();
    }

    return $rh_seqs;
}

sub split_coverage
{
    my $ra_coverage = shift;
    my $ra_ref_read_pointers = shift;
    my $ra_align_coords = shift;
    my $rh_indel_coverage = shift;
    my $cov_string = shift;
    my $qual_string = shift;
    my $ref_pos = shift;
    my $ref_seq = shift;
    my $qual_offset = shift;

    my $ref_base = substr($$ref_seq, $ref_pos - 1, 1);
    my $ref_end = length $$ref_seq;

    my $rev_cov = reverse $cov_string;
    my $rev_qual = reverse $qual_string;

    my $read_no = 0;
    my $del_no = 0; # count number of deleted bases seen
    my $qual = 0;
    while (my $base = chop $rev_cov)
    {
        if ($base =~ /[.,ATGCNatgcn*]/) # just a normal base
        {
            if ($base ne '*')
            {
                $qual = chop $rev_qual;
                $qual = ord($qual) - $qual_offset; # convert quality char to score
                $base = uc $ref_base if ($base eq '.'); # reference base (forward)
                $base = lc $ref_base if ($base eq ','); # reference base (reverse)
    
                push @{$ra_coverage}, {'base' => $base, 'qual' => $qual};
            }

            $read_no++; # record what read number we're on (even if it's a pad)
        }
        elsif (($base eq '^') && (!$no_indels)) # beginning of new alignment
        {
            my $align_score = chop $rev_cov;
            $align_score = ord($align_score) - 33; # samtools uses offset 33

            push @{$ra_align_coords}, {'start' => $ref_pos, 'score' => $align_score};
            $ra_ref_read_pointers->[$read_no + 1] = $ra_align_coords->[$#{$ra_align_coords}];
        }
        elsif (($base eq '$') && (!$no_indels)) # end of read
        {
            $ra_ref_read_pointers->[$read_no]->{'end'} = $ref_pos;
            my $this_read_start = $ra_ref_read_pointers->[$read_no]->{'start'};
            my $this_read_score = $ra_ref_read_pointers->[$read_no]->{'score'};
            #print "From $this_read_start-$ref_pos, score $this_read_score\n";
             
            splice(@{$ra_ref_read_pointers}, $read_no, 1); # remove this read from current list
            $read_no--;
        }
        elsif ($base eq '+') # insertion wrt reference
        {
            my $no_bases = 0;
            my $digit = chop $rev_cov;
            while ($digit =~ /\d/)
            {
                $no_bases *= 10;
                $no_bases += $digit;
                $digit = chop $rev_cov;
            }
            my $inserted_bases = $digit;
            my $this_base_no = 1;
            while ($this_base_no < $no_bases)
            {
                $inserted_bases .= chop $rev_cov;
                $this_base_no++;
            }

            my $strand = ($inserted_bases =~ /atgc/) ? 'C' : 'U';

            $inserted_bases = uc $inserted_bases;
            my $flank_bases_to_include = 1000;
            my $left_flank_end = $ref_pos;
            my $right_flank_start = $ref_pos + 1;

            my $left_flank_seq = substr($$ref_seq, $left_flank_end-$flank_bases_to_include, $flank_bases_to_include);
            my $right_flank_seq = substr($$ref_seq, $right_flank_start - 1, $flank_bases_to_include);

            $left_flank_seq = uc $left_flank_seq;
            $right_flank_seq = uc $right_flank_seq;

            my $ref_allele = '';

            my $new_allele = $inserted_bases;
            my $poly = GTB::Var::Polymorphism->new(
                                 -left_flank_end => $left_flank_end,
                                 -right_flank_start => $right_flank_start,
                                 -left_flank_seq => $left_flank_seq,
                                 -right_flank_seq => $right_flank_seq,
                                 -type => 'Insertion',
                                 -allele_seqs => [$ref_allele, $new_allele] );

            print "Original: $left_flank_end-$right_flank_start ($ref_allele/$new_allele) ($left_flank_seq/$right_flank_seq)\n" if ($verbose);

            my $new_left_flank_end = $poly->left_flank_end();
            my $new_right_flank_start = $poly->right_flank_start();
            my $new_ra_alleles = $poly->allele_seqs();
            $new_allele = $new_ra_alleles->[1];
            print "Adjusted: $new_left_flank_end-$new_right_flank_start\n" if ($verbose);
            if (!defined ($new_right_flank_start))
            {
                croak "Indel expanded has no right flank start!\n";
            }
            push @{$rh_indel_coverage->{$new_left_flank_end}->
                        {$new_right_flank_start}}, {
                                  'allele' => $new_allele,
                                  'qual' => $qual,
                                  'type' => 'Insertion',
                                  'strand' => $strand};

        }
        elsif ($base eq '-') # deletion wrt reference
        {
            my $no_bases = 0;
            my $digit = chop $rev_cov;
            while ($digit =~ /\d/)
            {
                $no_bases *= 10;
                $no_bases += $digit;
                $digit = chop $rev_cov;
            }
            my $deleted_bases = $digit;
            my $this_base_no = 1;
            while ($this_base_no < $no_bases)
            {
                $deleted_bases .= chop $rev_cov;
                $this_base_no++;
            }

            my $strand = ($deleted_bases =~ /atgc/) ? 'C' : 'U';
            $deleted_bases = uc $deleted_bases;
            my $del_size = length $deleted_bases;
            my $left_flank_end = $ref_pos;
            my $right_flank_start = $ref_pos + $del_size + 1;

            my $flank_bases_to_include = 1000;

            my $left_flank_seq = substr($$ref_seq, $left_flank_end - $flank_bases_to_include, $flank_bases_to_include);
            my $right_flank_seq = substr($$ref_seq, $right_flank_start - 1, $flank_bases_to_include);

            $left_flank_seq = uc $left_flank_seq;
            $right_flank_seq = uc $right_flank_seq;

            my $ref_allele = $deleted_bases;

            my $new_allele = '';
            my $poly = GTB::Var::Polymorphism->new(
                                 -left_flank_end => $left_flank_end,
                                 -right_flank_start => $right_flank_start,
                                 -left_flank_seq => $left_flank_seq,
                                 -right_flank_seq => $right_flank_seq,
                                 -type => 'Deletion',
                                 -allele_seqs => [$ref_allele, $new_allele] );

            print "Original: $left_flank_end-$right_flank_start ($ref_allele/$new_allele) ($left_flank_seq/$right_flank_seq)\n" if ($verbose);

            my $new_left_flank_end = $poly->left_flank_end();
            my $new_right_flank_start = $poly->right_flank_start();
            my $new_ra_alleles = $poly->allele_seqs();
            $new_allele = $new_ra_alleles->[1];
            print "Adjusted: $new_left_flank_end-$new_right_flank_start\n" if ($verbose);
            push @{$rh_indel_coverage->{$new_left_flank_end}->
                            {$new_right_flank_start}}, {
                                  'allele' => $new_allele,
                                  'qual' => $qual,
                                  'type' => 'Deletion',
                                  'strand' => $strand};
        }
    }
    #print "$ref_pos\t$read_no reads\n";
}

sub parse_single_copy_regions
{
    my $sc_string = shift;
    my $rh_ref_lengths = shift;
    my $rh_single_copy = {};

    my @sc_regions = split /,/, $sc_string;

    foreach my $region (@sc_regions) {
        if ($region !~ /:/) {
            $rh_single_copy->{$region} = '';
        }
        elsif ($region =~ /^([^:]+):(\d+)\-(\d+)$/) {
            my $ref = $1;
            my ($start, $end) = ($2, $3);
            my $length = $end - $start + 1;
            if ($rh_single_copy->{$ref}) {
                my $one_string = make_long_string('1', $length);
                substr($rh_single_copy->{$ref}, $start - 1, $length, $one_string);
            }
            else {
                my $ref_length = $rh_ref_lengths->{$ref};
                $rh_single_copy->{$ref} = make_long_string('2', $ref_length);
                my $one_string = make_long_string('1', $length);
                my $offset = $start - 1;
                print "Reference length $ref_length, start at $start - 1, length $length\n";
                substr($rh_single_copy->{$ref}, $offset, $length, $one_string);
            }
        }
    }

    return $rh_single_copy;
}

sub make_long_string {
    my $char = shift;
    my $desired_length = shift;

    my $power_of_2 = int(log($desired_length)/log(2));
    my $remainder = $desired_length - 2**$power_of_2;

    my $long_string = $char;
    for (my $i=1; $i<=$power_of_2; $i++) {
        $long_string = $long_string.$long_string;
    }

    for (my $j=0; $j<$remainder; $j++) {
        $long_string .= $char;
    }

    return $long_string;
}

sub filter_for_qual
{
    my $ra_coverage = shift;
    my $min_qual = shift;

    my @new_coverage = ();
    foreach my $rh_base (@{$ra_coverage})
    {
        if ($rh_base->{'qual'} >= $min_qual)
        {
            push @new_coverage, $rh_base;
        }
    }

    @{$ra_coverage} = @new_coverage;
}

sub filter_for_ds_coverage
{
    my $ra_coverage = shift;
    my $min_ds_coverage = shift;

    my @new_coverage = ();

    foreach my $for_base qw(A T G C)
    {
        my $rev_base = lc $for_base;
        my @for_bases = grep {$_->{'base'} eq $for_base} @{$ra_coverage};
        my $no_for = @for_bases;
        my @rev_bases = grep {$_->{'base'} eq $rev_base} @{$ra_coverage};
        my $no_rev = @rev_bases;
        next if ($no_for < $min_ds_coverage || $no_rev < $min_ds_coverage);
        push @new_coverage, @for_bases;
        push @new_coverage, @rev_bases;
    }

    @{$ra_coverage} = @new_coverage;
}

sub take_random_subset_of_coverage
{
    my $ra_coverage = shift;
    my $no_rands = shift;

    my @set = @{$ra_coverage}; # make a copy

    my $k = @set;
    return if ($k <= $no_rands);

    @{$ra_coverage} = ();
    while (--$no_rands >= 0) {
        my $r = int(rand($k));
        push @{$ra_coverage}, $set[$r];
        $set[$r] = $set[--$k];
    }
}

sub process_indels
{
    my $ref_name = shift;
    my $rh_indel_coverage = shift; # indel coverage arrays by position
    my $ra_align_coords = shift;
    my $ref_seq = shift;
    my $min_qual = shift;

    my $min_ac_index = 0; # this will be pushed up as we progress through the reference
    my $ra_max_end = []; # record maximum read end seen up to each index
    foreach my $left_flank_end (sort {$a <=> $b} keys %{$rh_indel_coverage})
    {
        my $sc_here = ($single_copy && exists($rh_sc_regions->{$ref_name}) &&
                        (($rh_sc_regions->{$ref_name} eq '') ||
                         (substr($rh_sc_regions->{$ref_name}, $left_flank_end-1, 1)
                             eq '1'))) ? 1 : 0;
       
        foreach my $right_flank_start (sort {$a <=> $b} keys %{$rh_indel_coverage->{$left_flank_end}})
        {
            my $ref_allele = substr($$ref_seq, $left_flank_end, 
                         $right_flank_start - $left_flank_end - 1);
            $ref_allele = uc $ref_allele;
            $ref_allele = '*' if (!$ref_allele);

            my %all_alleles = ();
   
            my $ra_coverage = [];

            my $reads_seen = 0; # count so that we know all remaining aligned must be reference
            foreach my $rh_cov (@{$rh_indel_coverage->{$left_flank_end}->{$right_flank_start}})
            {
                $reads_seen++;
                my $allele = $rh_cov->{'allele'} || '*';
                my $qual = $rh_cov->{'qual'};
                next if (($qual_filter) && ($qual < $qual_filter));
                my $strand = $rh_cov->{'strand'};
                $all_alleles{$allele} = 1;

                $allele = lc $allele if ($strand eq 'C');

                push @{$ra_coverage}, {'base' => $allele };
            }

            my @other_alleles = keys %all_alleles;
            next if (!@other_alleles); # all filtered out for quality

            # count number of reads that stretch across this putative indel's flanking ends: 
            my $no_reads = 0;
            for (my $ac_index = $min_ac_index; $ra_align_coords->[$ac_index] &&
                   $ra_align_coords->[$ac_index]->{'start'} <= $left_flank_end;
                   $ac_index++)
            {
                my $ac_end = $ra_align_coords->[$ac_index]->{'end'};
                $no_reads++ if ($ac_end >= $right_flank_start);
                if (!defined ($ra_max_end->[$ac_index]))
                {
                    $ra_max_end->[$ac_index] = ($ac_index == 0) ? $ac_end :
                                               ($ra_max_end->[$ac_index-1] > $ac_end) ? $ra_max_end->[$ac_index-1] : $ac_end;
                }
                # move min_ac_index up if we can
                if ($ra_max_end->[$ac_index] < $left_flank_end)
                {
                    $min_ac_index = $ac_index;
                }
            }

            my $no_ref_alleles = $no_reads - $reads_seen;

            for (my $i=0; $i<$no_ref_alleles; $i++)
            {
                push @{$ra_coverage}, {'base' => $ref_allele};
            }

            take_random_subset_of_coverage($ra_coverage, $no_rands);

            my $ra_all_alleles = [$ref_allele];
            push @{$ra_all_alleles}, keys %all_alleles;

            my $site_profile = GTB::Var::SiteProfile->new(
                                  -reference_base => $ref_allele,
                                  -coverage => $ra_coverage,
                                  -default_error => $prob_error,
                                  -allele_seqs => $ra_all_alleles,
                                  -single_copy => $sc_here);
            my $rh_log_likelihoods = $site_profile->log_likelihoods(
                                -prior_diff => $min_diff,
                                -alignment_bias => $align_bias);
    
            my @sorted_genotypes = sort {$rh_log_likelihoods->{$b} <=>
                                    $rh_log_likelihoods->{$a}} 
                                         keys %{$rh_log_likelihoods};
    
            my $mpg = $sorted_genotypes[0];
            my $nmpg = ($score_variant) ? "$ref_allele:$ref_allele" :
                                          $sorted_genotypes[1];
            my $score = int($rh_log_likelihoods->{$mpg} - $rh_log_likelihoods->{$nmpg});

            $mpg = '*'.$mpg if ($mpg =~ /^:/);
            $mpg = $mpg.'*' if ($mpg =~ /:$/);

            $ref_allele = uc $ref_allele || '*';
            my $nr_string = ($single_copy && $mpg eq $ref_allele) || 
                            ($mpg eq "$ref_allele:$ref_allele") ? 0 : 1;
            $nr_string .= ("\t" . scalar(@{$ra_coverage})); # kludgy!!
            print "MPG_DIV\t$ref_name\t$left_flank_end:$right_flank_start\t$ref_allele\t$mpg\t$score\t$nr_string\n";
        }
    }
}

=pod

=head1 NAME

B<bam2mpg> - Most probable genotype program for predicting variants and genotypes from alignments of short reads in BAM format.

=head1 SYNOPSIS

B<bam2mpg> I<ref.fasta> I<aln.sorted.bam>

=head1 DESCRIPTION

This script uses samtools to process a BAM formatted file (http://samtools.sourceforge.net) and call genotypes and confidence scores across a covered region.

For a set of aligned allele observations, the MPG ("Most Probable Genotype") algorithm is used to calculate the posterior probability of every possible diploid genotype (or single-allele genotypes for regions specified with the --single_copy option, e.g., on the non-PAR regions of the X and Y chromosome in a male).  The statistical model uses base quality scores to calculate the probability of base-calling errors, and assumes a single prior probability for any non-homozygous-reference genotype.

=head1 MPG INPUT

The first argument to bam2mpg is the path of a fasta-formatted file for the reference sequence.  This fasta file must have a corresponding samtools index file with the same name except for an appended ".fai".

The second argument to bam2mpg is the path of a BAM-formatted file of aligned sequencing reads.  This file must be sorted and indexed using samtools prior to running bam2mpg.

=head1 MPG OUTPUT

The standard output of the program contains lines with eight tab-separated fields.  These fields are:

=over 5

=item B<variant type>

The variant type can be "MPG_SNV", which indicates a single base change at the position specified by the second and third fields, or "MPG_DIV" which indicates a deletion or insertion occurring between the "flanking" positions separated by a colon in the third field.

=item B<chromosome>

This is the name of the entry in the fasta reference sequence passed as the first argument (and of the matching reference entry in the BAM file).

=item B<position>

For an SNV, the position reported is the actual position of the nucleotide change.  For DIV's, this field contains a colon-separated pair of positions, which represent the flanking positions surrounding the largest variable region in the sequence.  So, for example, in a variable-length run of T's, the flanking positions would be the positions of the non-T characters outside of the run, and the alleles reported in the fourth and fifth fields would be the T's between these flanking positions.

=item B<reference allele>

This is the base or bases seen in the reference sequence either at the specified position (for an SNV) or between the reported flanking positions (for a DIV).  When the flanking positions are adjacent, so there are no bases between them, a "*" is reported to enable splitting on white space rather than tabs.

=item B<genotype>

The genotype reported is the genotype with the highest posterior probability according to Bayes theorem, given the observed reads and quality scores, according to the program's error model.  For SNV's, the two alleles are concatenated, so, for example "AT" indicates one A and one T.  For DIV's, the two alleles are separated by a colon, with a "*" indicating an allele of zero bases.

When the --single_copy option is used, single allele genotypes are reported, so no colon-separation is used in the DIV.  This lack of a colon in MPG_DIV genotypes, as well as a single-character genotype at MPG_SNV positions, is what distinguishes "single_copy" output.

=item B<score>

The score field contains the MPG or MPV score.  By default, the score reported is the difference between the natural logarithms of the most probable and second most probable genotype's probabilities.  So, for example, a score of 10 would imply that the reported genotype was approximately 22,000 times as probable as the next most probable genotype.  Since bam2mpg will call genotypes at any base position, variant or not, we recommend using a score cutoff of 10 to avoid a high level of false positive predictions of variation.

For cases where the user is only interested in the probability that some variant exists at a position, rather than the exact genotype (heterozygous or homozygous), the --score_variant option can be used.  With --score_variant, the reported score is the difference between the natural logarithms of the most probable genotype and the homozygous reference genotype, so it represents the relative probability that something other than a homozygous reference genotype is present.

=item B<ref/non-ref field>

This field is 0 for a homozygous-reference genotype, and 1 for anything else, allowing easy extraction of non-reference genotype lines with "awk".

=item B<coverage>

This field reports the number of reads used to calculate the most probable genotype.  It does not include bases that have been filtered for quality (with --qual_filter) or reads beyond the 200 maximum reads the program allows.

=back

=head1 MPG OPTIONS

=over 5

=item B<--region> I<chr:start-end>

This option specifies a region as a reference entry optionally followed by a position range, and causes variants to be called only in that region.

=item B<--qual_filter> I<minimum_quality>

This option specifies a minimum base quality score necessary for a base to be included in the calculation for a particular aligned position.  Bases with quality scores below this value will be completely ignored.  At GTB, bam2mpg is almost always run with --qual_filter 20. (Default: 0)

=item B<--single_copy> I<chr1:start1-end1,chr2:start2-end2>

This option specifies regions for which only a single copy exists in the genome, so that only one allele is expected to be seen.  The regions should be comma-separated without spaces, and in the same format as expected by the --region option.

=item B<--score_variant>

This flag option turns on "MPV" scoring, which reports how much more probable it is that ANY variant genotype is present.  Note that this score will be zero for any position the MPG algorithm calls as homozygous reference, and non-zero for any position that is called with a non-reference genotype.

=item B<--indels>

This flag option causes the script to skip SNV predictions and only report DIV variants.

=item B<--no_indels>

This flag option causes the script to skip DIV predictions and only report SNV variants.

=item B<--only_nonref>

This flag option causes the script to only print lines that predict genotypes that are non homozygous reference.

=item B<--align_bias> I<bias_value>

This option specifies an additional expected percentage of aligned bases that are expected to be the reference allele due to bias in the alignment favoring the reference base.  For example, if the alignment bias has value .05, mpg will expect a GT heterozygous position with reference base "G" to have roughly 55% G's aligned at that position, and 45% T's.  It can also be used to tilt the expected percentages due to included probe sequence, which will always be reference, but in the long run it would be better to have a position-dependent alignment bias that only changed these expected values where the probes are located.  (Default: 0)

=item B<--ds_coverage> I<min_bases>

This option specifies a minimum number of bases that must be seen on each strand for that base's counts to be included in the probability calculation.  For example, if -ds_coverage is specified as 1, and an aligned "T" is observed multiple times on the forward strand, but never on the reverse strand, no T's will be included in the calculation because T was not seen at least once on the reverse strand.  This option is dangerous in that it can artificially amplify scores by eliminating errors, so its use is discouraged.

=item B<--nocache>

This flag option prevents bam2mpg from caching the SNV genotype calls it sees at each site.  This caching is meant to speed up run-time when genotypes are being called genome-wide, but because the caching doesn't consider exact base qualities, it can lead to slightly inaccurate scoring (but generally not actual genotype errors, especially when --qual_filter is used).

=back

=head1 AUTHOR

 Nancy F. Hansen - nhansen@mail.nih.gov

=head1 LEGAL

This software/database is "United States Government Work" under the terms of
the United States Copyright Act.  It was written as part of the authors'
official duties for the United States Government and thus cannot be
copyrighted.  This software/database is freely available to the public for
use without a copyright notice.  Restrictions cannot be placed on its present
or future use. 

Although all reasonable efforts have been taken to ensure the accuracy and
reliability of the software and data, the National Human Genome Research
Institute (NHGRI) and the U.S. Government does not and cannot warrant the
performance or results that may be obtained by using this software or data.
NHGRI and the U.S.  Government disclaims all warranties as to performance,
merchantability or fitness for any particular purpose. 

In any work or product derived from this material, proper attribution of the
authors as the source of the software or data should be made, using "NHGRI
Genome Technology Branch" as the citation. 

=cut

