#!/usr/bin/env perl
#
=TODO
    starting from parse reads, no need to pass %metadata as it is
    embeded in %hits_in__chr
=cut

use warnings;
use strict;
use Data::Dumper;
use 5.016;
use lib '/home/nicolas/scripts';
use Carp;
use Getopt::Long;
use File::Basename qw( fileparse );
use File::Copy;
use POSIX qw(floor ceil);
use List::Util qw(max min);
use Time::Progress;
use JSON;
use Bio::Sam::Utilities;
use Bio::Utilities;

$Data::Dumper::Sortkeys  = sub { [ sort{$a cmp $b}  keys %{$_[0]}] };

run( @ARGV ) unless caller();

sub run{
    my $metadata_ref = {
        'bin_size' => 10,
        'schema'   => 'span',
        'datasets' => {},
    };
    my $help     = '';
    my $single   = '';
    my $paired   = '';
    GetOptions (
        'bin_size=i' => \$metadata_ref->{'bin_size'},
        'schema=s'   => \$metadata_ref->{'schema'},
        'help'       => \$help,
        'single'     => \$single,
        'paired'     => \$paired,
    );
    usage() if $help;

    my @sam_files = @ARGV;
    usage('At least one input file is required') if @sam_files == 0 || $help;

    foreach my $sam (@sam_files){
        usage('Cannot find file:' . $sam . "\n") if ! -f $sam;
    }

    my %hits_in_chr;

    foreach my $sam_file (@sam_files){
        my($filename, $directories, $suffix) = fileparse($sam_file, qr/\.[^.]*/);

        parse_header($sam_file, $metadata_ref);

        # presize bin array for each chr
        if(scalar keys %hits_in_chr == 0){
            presize_hit_arrays(\%hits_in_chr, $metadata_ref, $filename);
        }

        if($metadata_ref->{'datasets'}{$filename}{'seq_type'} eq 'paired' || $paired){
            parse_paired_reads($sam_file, \%hits_in_chr, $metadata_ref);
        }
        elsif($metadata_ref->{'datasets'}{$filename}{'seq_type'} eq 'single' || $single){
            parse_single_reads($sam_file, \%hits_in_chr, $metadata_ref);
        }
        else{
            die "Cannot deduct sequencing type from sam header, expecting single or paired type.\nUse --single or --paired to set the sequencing type.";
        }

    }
    say Dumper $metadata_ref;

    Bio::Utilities::data_to_json($metadata_ref, File::Spec->catfile( 'result_metadata.json'));

    Bio::Utilities::data_to_json(\%hits_in_chr, File::Spec->catfile('result_bin-size' . $metadata_ref->{'bin_size'} . '_hits.json') );
    return;
}

sub add_metadata{
    my($hits_in_chr_ref, $metadata_ref) = @_;
    $hits_in_chr_ref->{'metadata'} = {
        'bin_size' => $metadata_ref->{'bin_size'},
        'dataset'  => $metadata_ref->{'dataset'},
        'chromosome_length' => $metadata_ref->{'chromosome_length'},
    };
    return;
}

sub presize_hit_arrays{
    my($hits_in_chr_ref, $metadata_ref, $dataset_name) = @_;

    my $metadata = $metadata_ref->{'datasets'}{ $dataset_name };
    print 'Creating bins...';
    foreach my $chr (sort {$a cmp $b} keys %{ $metadata->{'chromosome_length'} } ) {
        my $bins = ceil( $metadata->{'chromosome_length' }{ $chr } / $metadata_ref->{'bin_size'} );
        @{ $hits_in_chr_ref->{$chr} } = (0) x $bins;
    }
    say 'Done';

    return 1;
}

sub parse_header{
    my($filepath, $metadata_ref) = @_;
    my($filename, $directories, $suffix) = fileparse($filepath, qr/\.[^.]*/);

    my $dataset_metadata_ref = {
        'seq_type' => '',
    };

    print 'Parsing header... ';

    open(my $INFILE, '<', $filepath) || die "Can't open $filepath for reading!\n";
    while( my $line = <$INFILE> ){
        if( $line !~ '^@' ){ # get read length and exit
            my($qname, $flag, $rname, $pos, $mapq, $cigar, $rnext, $pnext, $tlen, $seq) = split(/\t/, $line);
            $dataset_metadata_ref->{'read_length'} = length($seq);
            last;
        }
        chomp($line);
        if( $line =~ m/\A [@]SQ [\s]+
            SN:([\S]+)  [\s]+   # chromosome identifier
            LN:([\S]+) (.*)     # chromosome length
        /xms){
            my($chromosome, $length) = ($1, $2);
            $dataset_metadata_ref->{'chromosome_length'}{ $chromosome } = $length;
        }
        elsif( $line =~ m/\A [@]PG [\s]+
            .+
            CL:bwa [\s]+ ([\S]+) # bwa call
        /xms){
            $metadata_ref->{'seq_type'} = $1 eq 'samse' ? 'single' : 'paired';
        }
    }
    close $INFILE;
    $metadata_ref->{'datasets'}{ $filename } = $dataset_metadata_ref;
    say 'Done';
    return 1;
}

sub parse_single_reads{
    my($filepath, $hits_in_chr_ref, $metadata_ref) = @_;
    my($filename, $directories, $suffix) = fileparse($filepath, qr/\.[^.]*/);
    my $dataset_metadata_ref = $metadata_ref->{'datasets'}{ $filename };

    # reads might span over several bins if bin size < read length
    my $bin_span = ceil( $dataset_metadata_ref->{'read_length'} / $metadata_ref->{'bin_size'} );

    # initialize completion counter
    my $read_count = `wc -l < $filepath  | tr -d ' '`;
    chomp ($read_count);
    my $p = Time::Progress->new(min => 0, max => $read_count);
    my $read_lines = 0;

    open(my $INFILE, '<', $filepath) || die "Can't open $filepath for reading!\n";
    while( my $line = <$INFILE> ){
        $read_lines++;
        print STDERR $p->report("\r%20b  [completed]: %p elapsed: %L",  $read_lines );

        next if $line =~ '^@';

        chomp($line);
        process_single($hits_in_chr_ref, $metadata_ref, $line, $bin_span, $dataset_metadata_ref);
    }
    close $INFILE;
    say ' Done';
    return 1;
}

sub process_single{
    my($hits_in_chr_ref, $metadata_ref, $line, $bin_span, $dataset_metadata_ref) = @_;
    my $strict = 1;

    my($qname, $flag, $rname, $pos, $mapq, $cigar) = split(/\t/, $line);

    if($flag & $flag_value_of{'read_unmapped'}){
        # case 1
        $dataset_metadata_ref->{'unmapped'}++;
    }
    elsif($cigar eq $dataset_metadata_ref->{'read_length'} . 'M'){
        # case 5
        hit_on_read_span(
            $hits_in_chr_ref, $metadata_ref, $dataset_metadata_ref, {
                'chr'      => $rname,
                'pos'      => $pos,
                'length'   => $dataset_metadata_ref->{'read_length'},
                'bin_size' => $metadata_ref->{'bin_size'},
            }
        );
    }
    elsif($strict == 1 && $mapq == 37 ){
        # case 4
        hit_on_read_span(
            $hits_in_chr_ref, $metadata_ref, $dataset_metadata_ref, {
                'chr'      => $rname,
                'pos'      => $pos,
                'length'   => $dataset_metadata_ref->{'read_length'},
                'bin_size' => $metadata_ref->{'bin_size'},
            }
        );
    }
    else{
        # case 2
        $metadata_ref->{'rejected'}++;
        $dataset_metadata_ref->{'rejected'}++;
    }

    return 1;

}

sub parse_paired_reads{
    my($filepath, $hits_in_chr_ref, $metadata_ref, $strict) = @_;
    my($filename, $directories, $suffix) = fileparse($filepath, qr/\.[^.]*/);
    my $dataset_metadata_ref = $metadata_ref->{'datasets'}{ $filename };

    $strict = 0 if ! defined $strict;

    # initialize completion counter
    my $lines = `wc -l < $filepath  | tr -d ' '`;
    chomp ($lines);
    my $p = Time::Progress->new(min => 0, max => $lines);

    my @pair;
    my $current_pair_id;
    my $read_length;
    my $line_count = 0;
    open(my $INFILE, '<', $filepath) || die "Can't open $filepath for reading!\n";

    # skip header and get first read to init buffer
    while( my $line = <$INFILE> ){
        $line_count++;
        next if $line =~ '^@';
        chomp($line);
        my($qname, $flag, $rname, $pos, $mapq, $cigar, $rnext, $pnext, $tlen, $seq) = split(/\t/, $line);
        $current_pair_id = $qname;
        $read_length = length($seq);
        push @pair, $line;
        last; # VERY IMPORTANT
    }

    # read the rest
    while( my $line = <$INFILE> ){
        $line_count++;
        print STDERR $p->report("\r%20b  [completed]: %p elapsed: %L",  $line_count );

        chomp($line);
        my($qname) = split(/\t/, $line);
        if( $qname eq $current_pair_id ){   # add to buffer
            push @pair, $line;
        }
        else{                               # flush buffer and re-init
            process_pair($hits_in_chr_ref, $metadata_ref, $dataset_metadata_ref, $read_length, @pair);
            $current_pair_id = $qname;
            @pair = ($line);
        }
    }
    # process last pair in buffer
    process_pair($hits_in_chr_ref, $metadata_ref, $dataset_metadata_ref, $read_length, @pair);
    close $INFILE;
    say ' Done';
    return 1;
}

sub process_pair{
    my($hits_in_chr_ref, $metadata_ref, $dataset_metadata_ref, $read_length, @pair) = @_;
    my $max_length = 500;
    my $strict = 1;

    $metadata_ref->{'pair_count'}++;
    $dataset_metadata_ref->{'pair_count'}++;

    @pair = process_alternates(@pair) if scalar @pair > 2;

    my($qname1, $flag1, $rname1, $pos1, $mapq1, $cigar1) = split(/\t/, $pair[0]);
    my($qname2, $flag2, $rname2, $pos2, $mapq2, $cigar2) = split(/\t/, $pair[1]);
    my $fragment_size = abs($pos1 - $pos2) + $read_length;
    my $fragment_start = $pos1 <= $pos2? $pos1: $pos2;

    # skip if fragment exceeds max length
    return if fragment_length_exceeded($metadata_ref, $fragment_size, $max_length);

    if($flag1 & $flag_value_of{'read_unmapped'} || $flag1 & $flag_value_of{'mate_unmapped'}){
        # case 1
        $metadata_ref->{'unmapped'}++;
        $dataset_metadata_ref->{'unmapped'}++;
    }
    elsif($rname1 ne $rname2){
        $metadata_ref->{'translocation'}++;
        $dataset_metadata_ref->{'translocation'}++;
    }
    elsif($cigar1 eq $read_length . 'M' && $cigar2 eq $read_length . 'M'){
        # case 5
        hit_on_read_span(
            $hits_in_chr_ref, $metadata_ref, $dataset_metadata_ref, {
                'chr'      => $rname1,
                'pos'      => $fragment_start,
                'length'   => $fragment_size,
                'bin_size' => $metadata_ref->{'bin_size'},
            }
        );
    }
    elsif($strict == 1 && ($mapq1 == 60 && $mapq2 == 60)){
        # case 4
        hit_on_read_span(
            $hits_in_chr_ref, $metadata_ref, $dataset_metadata_ref, {
                'chr'      => $rname1,
                'pos'      => $fragment_start,
                'length'   => $fragment_size,
                'bin_size' => $metadata_ref->{'bin_size'},
            }
        );
    }
    elsif($strict == 2 && ($mapq1 == 60 || $mapq2 == 60)){
        # case 3
        hit_on_read_span(
            $hits_in_chr_ref, $metadata_ref, $dataset_metadata_ref, {
                'chr'      => $rname1,
                'pos'      => $fragment_start,
                'length'   => $fragment_size,
                'bin_size' => $metadata_ref->{'bin_size'},
            }
        );
    }
    else{
        # case 2
        $metadata_ref->{'rejected'}++;
        $dataset_metadata_ref->{'rejected'}++;
    }
    return 1;
}

sub fragment_length_exceeded{
    my($metadata_ref, $size, $max_length) = @_;
    $max_length = 500 if ! defined $max_length;

    if($size > $max_length){
        $metadata_ref->{'oversized'}{'count'}++;
        $metadata_ref->{'oversized'}{'size'}{$size}++;
        return 1;
    }
    return 0;
}

sub process_alternates{
    my(@pair) = @_;

    my $first_in_pair;
    my $second_in_pair;
    my $alt_in_pair;
    foreach my $read (@pair) {
        my($qname, $flag) = split(/\t/, $read);
        if($flag & $flag_value_of{'supplementary_alignment'}){
            $alt_in_pair = $read;
        }
        elsif($flag & $flag_value_of{'first_in_pair'}){
            $first_in_pair = $read;
        }
        elsif($flag & $flag_value_of{'second_in_pair'}){
            $second_in_pair = $read;
        }
    }
    my($qname1, $flag1, $rname1) = split(/\t/, $first_in_pair);
    my($qname2, $flag2, $rname2) = split(/\t/, $second_in_pair);
    my($qname3, $flag3, $rname3) = split(/\t/, $alt_in_pair);

    $second_in_pair = $alt_in_pair if($rname2 ne $rname1 && $rname3 eq $rname1);
    return($first_in_pair, $second_in_pair);
}

# add hit to every bins that the read covers
sub hit_on_read_span{
    my($hits_in_chr_ref, $metadata_ref, $dataset_metadata_ref, $params) = @_;
    $metadata_ref->{'mapped'}{'count'}++;
    $dataset_metadata_ref->{'mapped'}++;
    $metadata_ref->{'mapped'}{'size'}{$params->{'length'}}++;
    my $bin_span = ceil( $params->{'length'} / $params->{'bin_size'} );
    foreach my $bin (0..($bin_span - 1)) {
        $hits_in_chr_ref->{ $params->{'chr'} }[ (position_to_bin($params->{'pos'}, $params->{'bin_size'}) + $bin) ]++;
    }
    return;
}

sub position_to_bin{
    my($pos, $bin_size) = @_;
    return floor( ($pos - 1)/ $bin_size );
}

sub usage{
    my($msg) = @_;
    say ' ';
    say $msg;
    say "usage:  $0 path_to_file(s) [options]";
    say "bins in result file are 0-based";
    say "\n" . 'Options:';
    say "\t --bin_size\t\tINT\tin nucleotide, default [10]";
    say "\t --schema\t\t\tdefines how to count the reads, default ['span']";
    say "\n";
    say "Set sequencing type if it can't be detected in the sam file header";
    say "\t --single or --paired";
    exit;
}

1;
