#!/usr/bin/env perl

#
# ========================================================================
# Analyze N times 2 ways
# ========================================================================
#

use strict;

use Getopt::Long;
use File::Basename;

use lib '/data/projects/n-way/modules';
use nway::genomesconfig;

$| = 1;

# ------------------------------------------------------------------------
# Constants
# ------------------------------------------------------------------------

my $GENOMESDIR = qq(/data/databases/genomes);

# ------------------------------------------------------------------------
# Commad line parameters
# ------------------------------------------------------------------------

# The debug level
my $debuglevel = 0;

# The base/target species
my $target;

# The requested species
my $specs_;
my @specs;

# The coordinate file
my $coordfile;

# The base directory to the twoway subdirectories
my $dbdir;

# The output filename
my $outfile;

# Minimum flanks for blocks
my $minflanks = 50;

# Maximum overlap in gaps
my $maxover = 25;

# Minimum gap size
my $mingapsize = 10;

# Minimum Coverage in gaps
my $mincov = 0.7;

# Maximum distance in gaps (reverse)
my $maxdist = 20;

# Maximal difference ratio (reverse)
my $maxdiffratio = 0.3;

# Manimal block flanks
my $minblockflanks = 150;

# For fasta extraction - extend nulcotide start and end by this number
my $extractext = 100;

# ------------------------------------------------------------------------
# Global variables
# ------------------------------------------------------------------------

# Hash (target, spec, type) -> bed-data
my %beds = ();

# Translate type to filename pattern
my %TYPES = (g => "gap", b => "block");

my $CHRHASHSIZE = 499;

#
# ------------------------------------------------------------------------
# Check command line parameters
# ------------------------------------------------------------------------
#
if (!@ARGV) {
	my $name = basename($0);

	print <<EOS;
USAGE
    $name -t target -s spec{,spec} -fc coordfile -db dbdir -fo outfile
    [-l minflanks] [-o maxover] [-c mincov] [-g mingapsize] [-d maxdist] [-r maxdiffratio] 
    [-debug n]

WHERE
    target       - the base/target species
    spec         - species of interest
    coordfile    - file with coordinates
                   line format: spec chr start end
    dbdir        - database/base directory with 2 way alignments (subdirs)
    outfile      - the output filename
    minflanks    - the amount of nucleotides difference on blocks
                   (default: $minflanks)
    maxover      - the maximum overlap in gaps
                   (default: $maxover)
    mincov       - the minimum coverage
                   (default: $mincov)
    mingapsize   - the minimum gapsize, if less the "N"
                   (default: $mingapsize)
    maxdist      - (rev) the maximum distance between preliminary search and actual position
                   (default: $maxdist)
    maxdiffratio - (rev) the maximum difference ratio
                   (default: $maxdiffratio)
    n            - debug info level (0,1,2)
EOS
	exit 0;
}

exit 1
  if (
	!GetOptions(
		't=s'     => \$target,
		's=s'     => \$specs_,
		'fc=s'    => \$coordfile,
		'db=s'    => \$dbdir,
		'fo=s'    => \$outfile,
		'l=i'     => \$minflanks,
		'o=i'     => \$maxover,
		'c=f'     => \$mincov,
		'g=i'     => \$mingapsize,
		'd=i'     => \$maxdist,
		'r=f'     => \$maxdiffratio,
		'debug=i' => \$debuglevel
	)
  );

if (!$target) {
	print qq(Please enter a base/target species!\n);
	exit 1;
}
if (!$specs_) {
	print qq(Please enter at least one species!\n);
	exit 1;
}
if (!$coordfile) {
	print qq(Please enter a coordinate file!\n);
	exit 1;
}
if (!-r $coordfile) {
	print qq(Cannot read coordinate file $coordfile!\n);
	exit 1;
}
if (!$dbdir) {
	print qq(Please enter the twoway base directory!\n);
	exit 1;
}
if (!-r $dbdir) {
	print qq(Cannot read twoway base directory $dbdir!\n);
	exit 1;
}
if (!$outfile) {
	print qq(Please enter an output filename!\n);
	exit 1;
}

@specs = split(",", $specs_);

#
# ------------------------------------------------------------------------
# Returns minimum
# ------------------------------------------------------------------------
#
sub min {
	return $_[0] < $_[1] ? $_[0] : $_[1];
}

#
# ------------------------------------------------------------------------
# Returns maximum
# ------------------------------------------------------------------------
#
sub max {
	return $_[0] > $_[1] ? $_[0] : $_[1];
}

#
# ------------------------------------------------------------------------
# Read the coordinates file and return a reference to an array with
# start, end elements.  If a 2way directory does not exist, return
# empty array.
# ------------------------------------------------------------------------
#
sub prepare {
	my ($file) = @_;

	print "==> prepare($file)\n" if ($debuglevel);

	my @rows;

	open(F, "<", $file);
	while (my $line = <F>) {
		$line =~ s/^\s+//;
		$line =~ s/\s+$//;
		my ($spec, $chr, $start, $end) = split(/\s+/, $line);
		push(@rows, [$spec, $chr, $start, $end]);
	}
	close(F);

	return \@rows;
}

#
# ------------------------------------------------------------------------
# Returns a hash value for the chromosome
#
# Params
# $_[0] - the chromsome name
# $_[1] - the location index (0 or 1)
# ------------------------------------------------------------------------
#
sub chrhash {
	my $hash  = 0;
	my $index = 1;
	for my $chr (split(//, $_[0])) {
		$hash += (ord($chr) - 32) * $index++;
	}
	return $hash % $CHRHASHSIZE + ($_[1] * $CHRHASHSIZE);
}

#
# ------------------------------------------------------------------------
# Reads a bed file and return a reference to a data array
# ------------------------------------------------------------------------
#
my $duration_load = 0;

sub getBeds {
	my ($spec, $type) = @_;

	print "==> getBeds($spec,$type)\n" if ($debuglevel);

	my $key = qq($target.$spec.$type);
	if ($beds{$key}) {
		print "get from buffer\n" if ($debuglevel);
		return $beds{$key};
	}

	my @bed;

	my $file = qq($dbdir/$target/$spec/calc.$TYPES{$type}.bed);
	print "load from file $file\n" if ($debuglevel);
	return \@bed if (!-r $file);

	my $starttime = time;

	open(BED, "<", $file);
	while (my $line = <BED>) {
		chomp($line);
		my @a = split(/\s+/, $line);

		# Remove line count
		@a = splice(@a, 1);

		my $hash = chrhash($a[0], 0);
		my $aa = $bed[$hash];
		$bed[$hash] = $aa = [] if (!$aa);
		push(@{$aa}, \@a);

		$hash = chrhash($a[3], 1);
		$aa = $bed[$hash];
		$bed[$hash] = $aa = [] if (!$aa);
		push(@{$aa}, \@a);
	}
	close(BED);

	$beds{$key} = \@bed;
	$duration_load += (time - $starttime);

	return \@bed;
}

#
# ------------------------------------------------------------------------
# Returns the coverage
#
# Params
# cstart,cend - coordinate start/end
# gstart,gend - gap start/end
# ------------------------------------------------------------------------
#
sub getCoverage {
	my ($cstart, $cend, $gstart, $gend) = @_;

	print "--> getCoverage($cstart,$cend,$gstart,$gend)\n" if ($debuglevel);

	my $res = 0;
	if ($gend - $gstart == 1) {
		$res = 1.0;
	}
	else {
		$res = min($cend, $gend) - max($cstart, $gstart) / ($gend - $gstart - 1);
		$res = 1.0 if ($res > 1.0);
	}

	print "res=$res\n" if ($debuglevel);

	return $res;
}

#
# ------------------------------------------------------------------------
# Recalculate start and end in block
#
# Params
# block  - block data
# tstart - target start
# tend   - target end
# ------------------------------------------------------------------------
#
sub getRange {
	my ($block, $tstart, $tend) = @_;

	my $targetSize = $block->[2] - $block->[1] + 1;
	my $querySize  = $block->[5] - $block->[4] + 1;
	my $flankLeft  = $tstart - $block->[1] + 1;
	my $flankRight = $block->[2] - $tend + 1;
	my $coordSize  = $tend - $tstart + 1;

	# The block start and end
	my ($start, $end);

	if (min($flankLeft, $flankRight) < $minblockflanks) {
		if ($flankLeft < $flankRight) {
			if ($block->[6] eq "+") {
				$start = $block->[4] + $flankLeft;
				$end   = $start + $coordSize;
			}
			else {
				$end   = $block->[5] - $flankLeft;
				$start = $end - $coordSize;
			}
		}
		else {
			if ($block->[6] eq "+") {
				$end   = $block->[5] - $flankRight;
				$start = $end - $coordSize;
			}
			else {
				$start = $block->[4] + $flankRight;
				$end   = $start + $coordSize;
			}
		}
	}
	else {
		# If flanks are long, we should take in account coefficient of stains similarity (css)
		my $css = $querySize / $targetSize;

		# Than we have to find "middle point" from both flanks (according strand value).

		# The midpoint's left and right
		my ($left, $right);

		if ($block->[6] eq "+") {
			$left  = $block->[4] + ($flankLeft * $css);
			$right = $block->[5] - ($flankRight * $css);
		}
		else {
			$left  = $block->[4] + ($flankRight * $css);
			$right = $block->[5] - ($flankLeft * $css);
		}

		my $center = sprintf("%d", ($left + $right) / 2);

		# Now we can calculate start and end of sequence inside block.
		$start = sprintf("%d", $center - $coordSize / 2);
		$end   = sprintf("%d", $center + $coordSize / 2);
	}

	return ($start, $end);
}

#
# ------------------------------------------------------------------------
# Returns the location like: "chr1:start1-end1/chr2:start2-end2/strand"
#
# Params
# bed    - block/gap data
# tstart - target start
# tend   - target end
# ------------------------------------------------------------------------
#
sub getLocation {
	my ($bed, $tstart, $tend) = @_;

	if ($tstart) {
		my ($start, $end) = getRange($bed, $tstart, $tend);
		return qq($bed->[3]:$start-$end/$bed->[6]);
	}
	else {
		return qq($bed->[3]:$bed->[4]-$bed->[5]/$bed->[6]);
	}
}

#
# ------------------------------------------------------------------------
# Analyze Foreward
# ------------------------------------------------------------------------
#
sub analyzeForeward {
	my ($row, $gaps, $blocks) = @_;

	print "==> analyzeForeward(" . join(",", @{$row}) . ")\n" if ($debuglevel);

	my $g = $gaps->[chrhash($row->[1], 0)];

	for (my $k = 0 ; $k < @{$g} ; $k++) {
		my $gap = $g->[$k];

		# Check chromosome in row and target
		next if ($row->[1] ne $gap->[0]);

		# The target and query widths
		my $tw = abs($gap->[2] - $gap->[1] - 1);
		my $qw = abs($gap->[5] - $gap->[4] - 1);

		# Assume insert is in target
		if ($tw > $qw) {
			next if ($tw <= $mingapsize);
			next if ($qw > $mingapsize);

			# Check overlap (it should be smaller or equal than $maxover)
			next if (max(abs($row->[2] - $gap->[1]), abs($gap->[2] - $row->[3])) > $maxover);

			# Check coverage, it should be bigger or equal than $mincov)
			next if (getCoverage($row->[2], $row->[3], $gap->[1], $gap->[2]) < $mincov);

			push(@{$row}, "-/fgq/" . getLocation($gap));
			return;
		}

		# Else insert is in query
		else {
			# Check gap and insert sizes
			next if ($tw <= $mingapsize);
			next if ($qw > $mingapsize);

			# Check that coordinates are around the gap in target.
			next if ($row->[2] >= $gap->[1] && $row->[3] <= $gap->[2]);

			push(@{$row}, "++/fgt/" . getLocation($gap));
			return;
		}
	}

	my $b = $blocks->[chrhash($row->[1], 0)];

	for (my $k = 0 ; $k < @{$b} ; $k++) {
		my $block = $b->[$k];

		# Check chromosome in row and target
		next if ($row->[1] ne $block->[0]);

		next if (min($row->[2] - $block->[1], $block->[2] - $row->[3]) < $minflanks);

		# The target and query widths
		my $tw = abs($block->[2] - $block->[1] - 1);
		my $qw = abs($block->[5] - $block->[4] - 1);

		my $praefix = $tw >= $mingapsize && $qw >= $mingapsize ? "+" : "-";
		push(@{$row}, "$praefix/fb/" . getLocation($block, $row->[2], $row->[3]));
		return;
	}

	push(@{$row}, "N/f");
}

#
# ------------------------------------------------------------------------
# Get preliminary parameters (for reverse search)
#
# Returns
# Reference to array (chromosome, start, end, size)
# ------------------------------------------------------------------------
#
sub getPreliminary {
	my ($row, $gaps) = @_;

	print "==> getPreliminary(" . join(",", @{$row}) . ",?)\n" if ($debuglevel);

	my $g = $gaps->[chrhash($row->[1], 1)];

	for (my $k = 0 ; $k < @{$g} ; $k++) {
		my $gap = $g->[$k];

		# Check chromosome in row and target
		next if ($row->[1] ne $gap->[3]);

		# The target width
		my $tw = abs($gap->[2] - $gap->[1] - 1);

		# Next if target gap is small
		next if ($tw > $mingapsize);

		# Overlap in query
		next if (max(abs($row->[2] - $gap->[4]), abs($gap->[5] - $row->[3])) > $maxover);

		# Coverage in Query
		next if (getCoverage($row->[2], $row->[3], $gap->[4], $gap->[5]) < $mincov);

		# Save target coordinates
		my $prelim = [$gap->[0], $gap->[1], $gap->[2], $row->[3] - $row->[2] + 1];

		print "prelim=" . join(",", @{$prelim}) if ($debuglevel > 1);

		return $prelim;
	}
}

#
# ------------------------------------------------------------------------
# Analyze reverse
# ------------------------------------------------------------------------
#
sub analyzeReverse {
	my ($row, $gaps, $blocks, $prelim) = @_;

	print "==> analyzeReverse(" . join(",", @{$row}) . ",?,?,?)\n" if ($debuglevel);

	# The parameters from preliminary search
	my ($chr, $start, $end, $size) = @{$prelim};

	my $b = $blocks->[chrhash($chr, 0)];

	for (my $k = 0 ; $k < @{$b} ; $k++) {
		my $block = $b->[$k];

		# Check chromosome in row and target
		next if ($chr ne $block->[0]);

		next if ($end < $block->[1]);
		next if ($start > $block->[2]);

		# Check flanks
		next if (min($start - $block->[1], $block->[2] - $end) < $minflanks);

		push(@{$row}, "-/rb/" . getLocation($block, $start, $end));
		return;
	}

	my $g = $gaps->[chrhash($chr, 0)];

	for (my $k = 0 ; $k < @{$g} ; $k++) {
		my $gap = $g->[$k];

		# Check saved chromosome and target
		next if ($chr ne $gap->[0]);

		# The target and query widths
		my $tw = abs($gap->[2] - $gap->[1] - 1);
		my $qw = abs($gap->[5] - $gap->[4] - 1);

		# Check minimum gap size
		next if ($tw > $mingapsize && $qw <= $mingapsize);

		# Check maximum distance
		next if (max(abs($start - $gap->[1]), abs($end - $gap->[2])) > $maxdist);

		# Check maximum diffratio
		next if ((abs($size - $qw) / $size) >= $maxdiffratio);

		push(@{$row}, "+/rg/" . getLocation($gap));
		return;
	}

	push(@{$row}, "N/r");
}

#
# ------------------------------------------------------------------------
# Write the progress of analyzis
# ------------------------------------------------------------------------
#
sub progress {
	my ($index, $count) = @_;

	open(PROGRESS, ">", qq(calc.progress));
	print PROGRESS "Calculate coordinate " . ($index + 1) . " of " . $count;
	close(PROGRESS);
}

#
# ------------------------------------------------------------------------
# Analyze all coordiantes of all species
# ------------------------------------------------------------------------
#
sub analyze {
	my ($rows) = @_;

	print "==> analyze()\n" if ($debuglevel);

	my $count = @{$rows};

	# For each coordinate
	for (my $i = 0 ; $i < $count ; $i++) {
		progress($i, $count);

		my $row   = $rows->[$i];
		my $cspec = $row->[0];

		# Do foreward, if the base species is equal to the species in the coordinate
		if ($target eq $cspec) {

			# Set first column
			my $bed = [$row->[1], $row->[2], $row->[3], $row->[1], $row->[2], $row->[3], "+"];
			push(@{$row}, "+/x/" . getLocation($bed));

			# All other species
			for (my $k = 1 ; $k < @specs ; $k++) {
				my $spec = $specs[$k];
				analyzeForeward($row, getBeds($spec, "g"), getBeds($spec, "b"));
			}
		}

		# Else reverse
		else {
			my $prelim = getPreliminary($row, getBeds($cspec, "g"));
			if (!$prelim) {
				for (my $k = 0 ; $k < @specs ; $k++) {
					push(@{$row}, "N/rp");
				}
				next;
			}

			# Set first column/species
			my $bed = ["", 0, 0, $prelim->[0], $prelim->[1], $prelim->[2], "-"];
			push(@{$row}, "-/x/" . getLocation($bed));

			# All other species
			for (my $k = 1 ; $k < @specs ; $k++) {
				my $spec = $specs[$k];

				# If the preliminary species is the same as in coordiante
				if ($cspec eq $spec) {
					my $bed = [$row->[1], $row->[2], $row->[3], $row->[1], $row->[2], $row->[3], "+"];
					push(@{$row}, "+/rp/" . getLocation($bed));
					next;
				}

				analyzeReverse($row, getBeds($spec, "g"), getBeds($spec, "b"), $prelim);
			}
		}
	}
}

#
# ------------------------------------------------------------------------
# Reads a sizes file for a species
# ------------------------------------------------------------------------
#
sub chrsizehash {
	my $hash  = 0;
	my $index = 1;
	for my $c (split(//, $_[0])) {
		$hash += (ord($c) - 32) * $index++;
	}
	return $hash % 1999;
}

#
# ------------------------------------------------------------------------
# Return a reference to the sizes array
# ------------------------------------------------------------------------
#
sub getChrSizes {
	my ($spec) = @_;

	my @sizes = [];

	my $file = $GENOMESDIR . "/" . nway::genomesconfig::getSys($spec) . "/calc.genome.sizes";
	if (-r $file) {
		open(F, "<", $file);
		while (my $line = <F>) {
			chomp($line);
			my ($chr, $size) = split(/\s+/, $line);
			my $hash = chrsizehash($chr);
			my $a    = $sizes[$hash];
			$sizes[$hash] = $a = [] if (!$a);
			push(@{$a}, [$chr, $size]);
		}
		close(F);
	}

	return \@sizes;
}

#
# ------------------------------------------------------------------------
# Returns the chromosome size - or 0 if not found
# ------------------------------------------------------------------------
#
sub getChrSize {
	my ($sizes, $chr) = @_;

	my $hash = chrsizehash($chr);
	if ($sizes->[$hash]) {
		my $a = $sizes->[$hash];
		for (my $i = 0 ; $i < @{$a} ; $i++) {
			return $a->[$i]->[1] if ($a->[$i]->[0] eq $chr);
		}
	}

	return 0;
}

#
# ------------------------------------------------------------------------
# Write the fasta data file
# ------------------------------------------------------------------------
#
my $duration_fasta = 0;

sub generateFasta {
	my ($rows) = @_;

	print "==> Generate fasta\n" if ($debuglevel);

	my $starttime = time;

	# Extended locations
	my %xlocs;

	# The result lines from all samtools calls
	my @lines;

	# Loop over all species (they are in $rows in the same order, starting column 4)
	for (my $i = 0 ; $i < @specs ; $i++) {
		my $spec = $specs[$i];
		my @locs;

		my $sizes = getChrSizes($spec);

		# Loop over all coordinate/result lines
		for (my $k = 0 ; $k < @{$rows} ; $k++) {
			if ($rows->[$k]->[$i + 4] =~ m|./.+?/(.+?):(.+?)\-(.+?)/|) {
				my ($chr, $start, $end) = ($1, $2, $3);

				my $xstart = $start - $extractext;
				$xstart = 1 if ($xstart < 1);
				my $xend = $end + $extractext;
				my $size = getChrSize($sizes, $chr);
				$xend = $size if ($size && $size <= $xend);

				my $xloc = qq($chr:$xstart-$xend);
				push(@locs, $xloc);
				$xlocs{$xloc} = qq($start-$end);
			}
		}

		# Call samtools, sum up all results and add the species in headers
		my $file = $GENOMESDIR . "/" . nway::genomesconfig::getSys($spec) . "/calc.genome.fa";
		if (-r qq($file.fai)) {
			my $w = 1000;
			for (my $k = 0 ; $k < @locs ; $k += $w) {
				my $locs_ = join(" ", @locs[$k .. $k + $w - 1]);
				my $command = qq(samtools faidx -n 100 $file $locs_ 2>/dev/null);
				my @a = qx($command);
				map {$_ =~ s/^>/>$spec\//;} @a;
				push(@lines, @a);
			}
		}
	}

	open(F, ">", qq(calc.nway.fa));
	for my $line (@lines) {
		if ($line =~ m/^>.+?\/(.+?:.+?\-.+?)\n/) {
			my $key = $1;
			$line =~ s/\n/ ($xlocs{$key})\n/;
		}
		print F $line;
	}
	close(F);

	$duration_fasta += (time - $starttime);
}

#
# ------------------------------------------------------------------------
# Write the nway data file
# ------------------------------------------------------------------------
#
sub generateNway {
	my ($rows) = @_;

	print "==> Generate nway\n" if ($debuglevel);

	open(NWAY, ">", $outfile);

	print NWAY "Coordinates";
	for my $spec (@specs) {
		print NWAY "\t$spec";
	}
	print NWAY "\n";

	for (my $i = 0 ; $i < @{$rows} ; $i++) {
		my @row = @{$rows->[$i]};
		print NWAY "$row[0]/$row[1]:$row[2]-$row[3]";
		for (my $k = 4 ; $k < @row ; $k++) {
			print NWAY "\t", @row[$k];
		}
		print NWAY "\n";
	}

	close(NWAY);
}

#
# ------------------------------------------------------------------------
# The main routine
# ------------------------------------------------------------------------
#
sub main {
	my $starttime = time;

	print "==> Start with target=$target,specs=${specs_}\n" if ($debuglevel);

	# Do the calculation
	my $rows = prepare($coordfile);
	analyze($rows);

	# Generate output
	generateNway($rows);
	generateFasta($rows);

	my $duration = time - $starttime;
	print "==> Durations (seconds)\n";
	print "Load Bed       : $duration_load\n";
	print "Analyze        : " . ($duration - $duration_load - $duration_fasta) . "\n";
	print "Generate Fasta : $duration_fasta\n";
	print "Total          : $duration\n";

	#print "==> Distribution\n";
	#for my $key (keys %beds) {
	#	print "--> $key\n";
	#	for my $a ($beds{$key}) {
	#		my $count = 0;
	#		for (my $i = 0 ; $i < $CHRHASHSIZE * 2 ; $i++) {
	#			if ($a->[$i]) {
	#				print "$i:" . @{$a->[$i]} . ", ";
	#				print "\n" if ((++$count % 10) == 0);
	#			}
	#		}
	#	}
	#}
}

main();
