#!/usr/bin/env perl

#
# ========================================================================
# Change the symbols in nway file calc.nway to a better estimation.
# Therefore take the previously generated fasta file containins
# sequences with extension.
#
# Following steps have to be done:
#
# - Read the calc.nway file to get a mapping of "x/f/chr:start-end/x"
#   to the position(s) in calc.nway.  Use this positon information to
#   read the fasta data and prepare a mapping of an id
#   "syn/chr:start-end" to position(s) in the calc.nway file.  Now we
#   can submit the fasta data of a row from calc.nway to muscle and
#   evaluate the density.  If it changed, we can easily replace the
#   symbol in calc.nway, because we now the mapping of id -> {pos},
#   which was derived previously.
# ========================================================================
#

use strict;

use Getopt::Long;
use File::Basename;

#use lib '/data/projects/n-way/modules';
#use nway::genomesconfig;

#nway::genomesconfig::init();

# Needed for rcol calcualtion
my $COORDSFILE  = qq(input.coords);
my $SPECIESFILE = qq(input.species);

my $MUSCLEFILE = qq(calc.tmp.muscle);

# The previously generated nway results
my $nwayfile;

# The fasta file
my $fastafile;

# Minimum target length
my $tminlen = 10;

# Maximum target length
my $tmaxlen = 1000000;

# Densities array with elements [max-percentage, symbol]
my @densities  = ();
my $sdensities = "10:-,25:-?,75:+?,100:+";

# Number of flanked nucleotids added left and right to the sequence
my $extension;

# The sliding window expansion
my $expansion = 20;

# The step size
my $stepsize = 5;

# Target column for reverse search
my $rcol = -1;

# Debug modus switch
my $debug = 0;

# The data from nway file
my @rows;

# The synonyms from nway file
my @syns;

# ------------------------------------------------------------------------
# Usage info
# ------------------------------------------------------------------------

if (!@ARGV) {
	my $name = basename($0);

	print <<EOS;
USAGE
    $name -fn nwayfile -ff fastafile -r col
    [-tl minlen] [-tm maxlen] [-d densities] [-e extension]
    [-x expansion] [-s stepsize] [-debug]

WHERE
    nwayfile   - the input nway result file
    fastafile  - the fasta file (produced before)
    col        - the target column (used in revere serach - starts with 0!)
    minlen     - the minimum target length
                 (default: $tminlen)
    maxlen     - the maximum target length
                 (default: $tmaxlen)
    densities  - the densities for symbols like 10:-,30:-?,100:+
                 (default: $sdensities)
    extension  - number of flanked nucleotids added left and right to the
                 sequence
                 (default: $extension)
    expansion  - the sliding window expansion to left and right
                 (default: $expansion)
    stepsize   - the sliding window step size
                 (default: $stepsize)
    debug      - debug mode
                 (default: off)
EOS
	exit 0;
}

# ------------------------------------------------------------------------
# Get and check parameters
# ------------------------------------------------------------------------

exit 1
  if (
	!GetOptions(
		'fn=s'  => \$nwayfile,
		'ff=s'  => \$fastafile,
		'tl=i'  => \$tminlen,
		'tm=i'  => \$tmaxlen,
		'd=s'   => \$sdensities,
		'e=i'   => \$extension,
		'x=i'   => \$expansion,
		's=i'   => \$stepsize,
		'r=i'   => \$rcol,
		'debug' => \$debug
	)
  );

if (!$nwayfile) {
	print qq(Please enter an input (nway) file!\n);
	exit 1;
}
elsif (!-r $nwayfile) {
	print qq(Cannot read input (nway) file!\n);
	exit 1;
}

if (!$fastafile) {
	print qq(Please enter fasta file!\n);
	exit 1;
}
elsif (!-r $fastafile) {
	print qq(Cannot read fasta file!\n);
	exit 1;
}

# Get extension from fasta file if not given on command line
if (!$extension) {
	my $count = 0;
	open(F, "<", $fastafile);
	while (my $line = <F>) {
		last if ($count++ > 1000);
		if ($line =~ m|^>.*?/.*?:(\d+)\-\d+.*?\((\d+)\-\d+|) {
			my ($xstart, $start) = ($1, $2);
			if ($xstart > 0) {
				$extension = $start - $xstart;
				last;
			}
		}
	}
	close($fastafile);
}
if (!$extension) {
	print "Extension is missing or cannot be derived from fasta file!";
	exit 1;
}

# Expansion is at maximum like extension
$expansion = $extension if ($expansion > $extension);

# Reads reverse column (target in reverse search) if not given
if ($rcol == -1) {
	if (-f $COORDSFILE && -f $SPECIESFILE) {
		open(F, "<", $COORDSFILE);
		my $line = <F>;
		close(F);

		$line =~ m/([^\s]+)/;
		my $syn = $1;

		open(F, "<", $SPECIESFILE);
		$line = <F>;
		close(F);

		my @a = split(/,/, $line);
		for (my $i = 1 ; $rcol == -1 && $i < @a ; $i++) {
			$rcol = $i if ($a[$i] eq $syn);
		}
	}
	if ($rcol == -1) {
		print "Cannot derive reverse column!";
		$rcol = 0;
	}
}

# Convert density string to array
$sdensities =~ s/\s+//g;
for my $s (split(",", $sdensities)) {
	my @a = split(/:/, $s);
	$a[0] = sprintf("%.3f", $a[0] / 100.0);
	push(@densities, \@a);
}

#
# ------------------------------------------------------------------------
# Return the mapping id "syn/chr:start-end"-> [position]
#
# Note that the id is NOT unique - it may point to many positions!
# ------------------------------------------------------------------------
#
sub getId2Pos {
	my %id2pos;

	for (my $row = 0 ; $row < @rows ; $row++) {
		for (my $i = 0 ; $i < @syns ; $i++) {
			if ($rows[$row]->[$i] =~ m|^.*?/.*?/(.*?\:\d+\-\d+)|) {
				my $id = "$syns[$i]/$1";
				my $a  = $id2pos{$id};
				$id2pos{$id} = $a = [] if (!$a);
				push(@$a, "$row-$i");
			}
		}
	}

	return \%id2pos;
}

#
# ------------------------------------------------------------------------
# Extract fasta seqeunces
# ------------------------------------------------------------------------
#
sub extract {
	print "\n==> Extract referenced sequences from fasta file...\n";

	my $starttime = time;

	my %seqs;
	my $id2pos = getId2Pos();

	my ($pos, $seq) = (undef, "");
	open(F, "<", $fastafile);
	while (my $line = <F>) {
		chomp($line);
		if ($line =~ m|^>(.*?)/(.*?):.*?\((\d+)\-(\d+)|) {
			my ($syn, $chr, $start, $end) = ($1, $2, $3, $4);
			if ($pos) {
				map {$seqs{$_} = $seq} @$pos;
			}
			$pos = $id2pos->{"$syn/$chr:$start-$end"};
			$seq = "";
		}
		else {
			$seq .= $line;
		}
	}
	if ($pos) {
		map {$seqs{$_} = $seq} @$pos;
	}
	close(F);

	print "Duration: " . (time - $starttime) . " seconds\n";

	return \%seqs;
}

#
# ------------------------------------------------------------------------
# Return the cottected target's start and end after alignment.  Meaning
# to adjust by integration of gaps.
# ------------------------------------------------------------------------
#
sub getTargetRange {
	my ($seq) = @_;

	my @a = split(//, $seq);
	my ($start, $end) = (0, length($seq) - 1);

	my $diff = 0;
	for (my $i = 0 ; $i < @a && $i < $extension + $diff ; $i++) {
		$diff++ if ($a[$i] eq "-");
	}
	$start += $diff;

	$diff = 0;
	for (my $i = @a - 1 ; $i >= 0 && $end - $i < $extension + $diff ; $i--) {
		$diff++ if ($a[$i] eq "-");
	}
	$end -= $diff;

	return ($start + $extension, $end - $extension);
}

#
# ------------------------------------------------------------------------
# Extracts the target sequence from the muscle output
# ------------------------------------------------------------------------
#
sub getTargetSeq {
	my ($id, $lines) = @_;

	my ($seq, $found) = ("", 0);

	for my $line (@$lines) {
		chomp($line);
		if ($line =~ m/^>/) {
			return $seq if ($found);
			$found = 1 if (index($line, $id) >= 0);
		}
		elsif ($found) {
			$seq .= $line;
		}
	}

	return $found ? $seq : "";
}

#
# ------------------------------------------------------------------------
# Calculate density of a sequence.  Returns -1 if not possible.
# ------------------------------------------------------------------------
#
sub getDensity {
	my ($seq, $start, $end) = @_;

	my $s     = substr($seq, $start, $end - $start);
	my $count = $s =~ tr/ACGNTacgnt//;

	my $len = length($s);
	return $len == 0 ? -1 : sprintf("%.3f", $count / $len);
}

#
# ------------------------------------------------------------------------
# Replace symbol in nway data related to density value
# ------------------------------------------------------------------------
#
sub adjustSeq {
	my ($seq, $row, $col, $start, $end, $tdensity) = @_;

	# Don't do on target column and check target density!
	return if ($col == $rcol || $tdensity <= 0.0);

	# Minimum density
	my $density = 2.0;

	if ($extension > 0) {

		# Recalculate start, end based on expansion
		my $xstart = $start - $expansion;
		my $xend   = $end - $expansion;

		# Loop over "windows"
		while ($xend < $end + $expansion) {
			my $d = getDensity($seq, $xstart, $xend) / $tdensity;
			$density = $d if ($d >= 0.0 && $d < $density);
			$xstart += $stepsize;
			$xend   += $stepsize;
		}
	}
	else {
		$density = getDensity($seq, $start, $end) / $tdensity;
	}

	if ($density >= 0.0 && $density < 2.0) {
		for my $a (@densities) {
			if ($density < $a->[0]) {
				$rows[$row]->[$col] =~ m|^(.*?)/|;
				my $symbol = $1;
				if ($symbol && $symbol ne $a->[1]) {
					printf("Change at [%d,%d] from %s to %s, density=%.2f%%\n",
						$row, $col, $symbol, $a->[1], $density * 100);
					$rows[$row]->[$col] =~ s|^.*?/|$a->[1]/|;
				}
				return;
			}
		}
	}
}

#
# ------------------------------------------------------------------------
# Adjust sequences - meaning: replace symbols in query data
# ------------------------------------------------------------------------
#
sub adjustSeqs {
	my ($lines, $row, $id2col, $start, $end, $tdensity) = @_;

	my ($id, $seq) = ("", "");
	for my $line (@$lines) {
		chomp($line);
		if ($line =~ m/^>(.+?:.+?\-.+)/) {
			my $newid = $1;
			adjustSeq($seq, $row, $id2col->{$id}, $start, $end, $tdensity) if ($id);
			$id  = $newid;
			$seq = "";
		}
		else {
			$seq .= $line;
		}
	}
	adjustSeq($seq, $row, $id2col->{$id}, $start, $end, $tdensity) if ($id);
}

#
# ------------------------------------------------------------------------
# Checks the table row for
# - target length between min and max
# - the number of valid entries with coordinates (>1)
# - query sizes (max-query-size + 2 * ext) * 10 < target-size + 2 * ext
# ------------------------------------------------------------------------
#
sub checkRow {
	my ($row, $rcol) = @_;

	# Extension to both sides
	my $x = 2 * $extension;

	# The maximum query and the target length and the count of valid ids
	my ($qlen, $tlen, $count) = (0, 0, 0);

	my @cols = @{$rows[$row]};

	# Get and check target length (without extension)
	if ($cols[$rcol] =~ m|./.+?/.+?:(.+?)\-(.+?)/|) {
		$tlen = $2 - $1 + 1;
		return 0 if ($tlen < $tminlen || $tlen > $tmaxlen);
		$tlen += $x;
	}

	# Get query lengths
	for (my $i = 0 ; $i < @cols ; $i++) {
		if ($cols[$i] =~ m|./.+?/.+?:(.+?)\-(.+?)/|) {
			my $len = $2 - $1 + 1 + $x;
			$qlen = $len + $x if ($i != $rcol && $len > $qlen);
			$count++;
		}
	}

	return $count > 1 && $qlen < $tlen * 10 ? 1 : 0;
}

#
# ------------------------------------------------------------------------
# Align all rows (sequence data) with muscle and adjust symbols
# ------------------------------------------------------------------------
#
sub align {
	my ($seqs) = @_;

	print "\n==> Align rows data with muscle...\n";

	my $starttime = time;
	my $time      = $starttime;

	for (my $row = 0 ; $row < @rows ; $row++) {
		if (time - $time >= 10) {
			printf("%.1f%% done...\n", $row / @rows * 100);
			$time = time;
		}

		next if (!checkRow($row, $rcol));

		# Get the id -> position (=row,col) mapping and write
		# the muscle input
		my %id2col;
		open(F, ">", $MUSCLEFILE);
		for (my $i = 0 ; $i < @syns ; $i++) {
			my $id = $rows[$row]->[$i];
			print F ">", $id, "\n", $seqs->{"$row-$i"}, "\n";
			$id2col{$id} = $i;
		}
		close(F);

		# Run muscle
		my @lines = qx(sh -c "muscle -maxiters 1 -diags -in $MUSCLEFILE 2>/dev/null");

		if ($debug) {
			print "Muscle output\n";
			for my $line (@lines) {
				print $line;
			}
		}

		# Now adjust symbols on muscle output
		my $tseq = getTargetSeq($rows[$row]->[$rcol], \@lines);
		my ($start, $end) = getTargetRange($tseq);
		my $tdensity = getDensity($tseq, $start, $end);
		if ($tdensity <= 0.0) {
			printf("Invalid density of '%s' [%s]!\n", $tseq, $rows[$row]->[$rcol]);
		}
		else {
			adjustSeqs(\@lines, $row, \%id2col, $start, $end, $tdensity);
		}
	}

	unlink($MUSCLEFILE) if (-f $MUSCLEFILE);

	print "Duration: " . (time - $starttime) . " seconds\n";
}

#
# ------------------------------------------------------------------------
# Loads rows and syns from nway file
# ------------------------------------------------------------------------
#
sub init {
	open(F, "<", $nwayfile);
	my $line = <F>;
	chomp($line);
	@syns = split(/\t/, $line);
	while ($line = <F>) {
		chomp($line);
		my @a = split(/\t/, $line);
		push(@rows, \@a);
	}
	close(F);

	if ($rcol >= @syns) {
		print "Revers column > columns in nway data file!";
		exit 1;
	}
}

#
# ------------------------------------------------------------------------
# Writes data back
# ------------------------------------------------------------------------
#
sub finalize {
	rename($nwayfile, qq($nwayfile.old));
	open(F, ">", qq($nwayfile));
	print F join("\t", @syns), "\n";
	for (my $row = 0 ; $row < @rows ; $row++) {
		print F join("\t", @{$rows[$row]}), "\n";
	}
	close(F);
}

#
# ------------------------------------------------------------------------
# main
# ------------------------------------------------------------------------
#
sub main {
	my $starttime = time;

	print "==> Parameters\n";
	printf("nwayfile=%s\ntminlen=%d\ntmaxlen=%d\ndensities=%s\nextension=%d\nstepsize=%d\n",
		$nwayfile, $tminlen, $tmaxlen, $sdensities, $extension, $stepsize);

	init();
	align(extract());
	finalize();

	print "Total time: " . (time - $starttime) . " seconds\n";
}

main();
