#!/usr/bin/env perl

#
# ========================================================================
# Script to find presence or absence of regions
# ========================================================================
#

use strict;

use Getopt::Long;
use File::Basename;

# The axt, block and genomesize filename
my ($axtfile, $blockfile, $sizefile);

# Minimum block length
my $minlen = 10;

my %sizeerrors;

#
# ------------------------------------------------------------------------
# Usage
# ------------------------------------------------------------------------
#

if (!@ARGV) {
	my $name = basename $0;
	print <<EOS;
USAGE: $name -fa axtfile -fb blockfile -fs sizefile [-l minlen]

WHERE
    axtfile   - name of the axt file
    blockfile - name of the generated block bed file
    sizefile  - name of the file with genome sizes (for - strand!)
    minlen    - minimum block length
                (default $minlen)

DESCRIPTION
    Find presence or absencse of regions in a two way alignment.  The
    calculation is based on lines in an "axt" file like:

      0 chr1 123 234 chr2 3222 3333 + 111
      1 chr1 235 412 chr2 3396 3597 + 201
      ...

    describing the alignment.  You see that the end of chr1 234 is
    continued on the next line with 235.  That means both fragments
    fit directly together.  On the other side there is a gap from 3333
    to 3396, meaning theere is an insertion in chr2 or other way round
    a deletion in chr1.

    The parameter delta describes the number of nuleotids of
    unpreciseness between end and start.  So the first fragment would
    fit together with the condition 234 < 235 - delta.

    An example

      # $name -fa hg38.panTro5.net.axt -fb calc.clock.bed -fs calc.query.sizes
      0 chr2 sp0 chr1 10918 11386 chrUn_NW_015974624v1 22008 22462 - 34937
      1 chr1 11387 11448 chrUn_NW_015983188v1 1920 1981 + 5767
      2 chr1 11449 14671 chrUn_NW_015974624v1 22463 25700 - 288565
      3 chr1 14672 16562 chrUn_NW_015974624v1 25842 27742 - 171642
EOS
	exit 0;
}

#
# ------------------------------------------------------------------------
# Get parameters and check them
# ------------------------------------------------------------------------
#

exit 1
  if (
	!GetOptions(
		'fa=s' => \$axtfile,
		'fb=s' => \$blockfile,
		'fs=s' => \$sizefile,
		'l=i'  => \$minlen
	)
  );

if (!$axtfile) {
	print qq(Please enter an axt filename!\n);
	exit 1;
}
if (!$blockfile) {
	print qq(Please enter a block filename!\n);
	exit 1;
}
if ($minlen <= 0) {
	print qq(Minimum block length must be > 0!\n);
	exit 1;
}

#
# ------------------------------------------------------------------------
# Read the sizes file
# ------------------------------------------------------------------------
#
sub getSizes {
	my ($filename) = @_;

	my %sizes;

	if (-r $filename) {
		open(F, "<", $filename);
		while (my $line = <F>) {
			my ($id, $size) = split(/\s+/, $line);
			$sizes{$id} = $size;
		}
		close(F);
	}

	return \%sizes;
}

#
# ------------------------------------------------------------------------
# "Reverse" the size of start and end
# ------------------------------------------------------------------------
#
sub revsize {
	my ($start, $end, $sizes, $chr) = @_;

	my $chrsize = $sizes->{$chr};

	# Change start and end
	my $x = $start;
	$start = $end;
	$end   = $x;

	# Recalulate from the chromosome size
	$_[0] = $chrsize - $start + 1;
	$_[1] = $chrsize - $end + 1;

	return 1;
}

#
# ------------------------------------------------------------------------
# Open axt file and read the positions
# ------------------------------------------------------------------------
#
sub main {
	my $sizes = getSizes($sizefile);

	my ($index, $chr1, $start1, $end1, $chr2, $start2, $end2, $strand, $score);

	open(AXT,   "<", $axtfile) or die qq(Cannot open $axtfile!);
	open(BLOCK, ">", $blockfile);

	while (my $line = <AXT>) {
		if ($line =~ m/(\d+)\s+([^\s]+)\s+(\d+)\s+(\d+)\s+([^\s]+)\s+(\d+)\s+(\d+)\s+([\+\-])\s+(\-?\d+)/) {
			($index, $chr1, $start1, $end1, $chr2, $start2, $end2, $strand, $score) =
			  ($1, $2, $3, $4, $5, $6, $7, $8, $9);

			# If two gaps: skip
			next if ($end1 - $start1 < $minlen && $end2 - $start2 < $minlen);

			# Check if second chromosome has size
			next if (!$sizes->{$chr2});

			# If the second strand is reverse
			if ($strand eq "-" && !revsize($start2, $end2, $sizes, $chr2)) {
				$sizeerrors{$chr2} = 1;
				next;
			}

			print BLOCK "$index $chr1 $start1 $end1 $chr2 $start2 $end2 $strand $score\n";
		}
	}

	close(AXT);
	close(BLOCK);

	if (%sizeerrors) {
		print "Error: No size info for ", join(",", sort keys %sizeerrors), "\n";
	}
}

main();
