#!/usr/bin/env perl

#
# ========================================================================
# Write the fasta data file for a previously generated nway result
# ========================================================================
#

use strict;

use Getopt::Long;
use File::Basename;

use lib '/data/projects/n-way/modules';
use nway::genomesconfig;

# Initialise the genomes configuration module before getSys() is used below
# NOTE(review): what init() actually loads is defined in nway::genomesconfig - confirm there
nway::genomesconfig::init();

# Autoflush of STDOUT is deliberately left disabled: with it enabled the
# output of this script was printed BEFORE the nway_calc stdout
# $| = 1;

# Genomes directory (option -pg); the per species files are looked up
# in the sub directory returned by nway::genomesconfig::getSys()
my $genomesdir;

# The previously generated nway results (option -fn); a tab separated
# file whose first line names the species columns
my $nwayfile;

# The fasta (output) file (option -ff)
my $fastafile;

# The extract extension (option -e): subtracted from the start and added
# to the end of every location before the sequence is extracted
my $extractext = 100;

# Called without any argument: print the usage and stop successfully
if (!@ARGV) {
	my $name = basename($0);

	# The default shown for -e is the initial value of $extractext
	print <<EOS;
USAGE
    $name -fn nwayfile -ff fastafile -pg genomespath -e extensionext

WHERE
    nwayfile     - the input nway result file
    fastafile    - the output fasta file
    genomespath  - directory containing the genomes data
    extensionext - the number of bases each location is extended by
                   on both sides (default: $extractext)
EOS
	exit 0;
}

# Read the command line options into the file level settings. GetOptions
# returns false on an invalid command line - stop with an error status then.
GetOptions(
	'fn=s' => \$nwayfile,
	'ff=s' => \$fastafile,
	'pg=s' => \$genomesdir,
	'e=i'  => \$extractext
) or exit 1;

# Validate the options: every problem is reported and ends the script
# with exit status 1

# The nway input file must be given and readable
if (!$nwayfile) {
	print qq(Please enter an input (nway) file!\n);
	exit 1;
}
elsif (!-r $nwayfile) {
	print qq(Cannot read input (nway) file!\n);
	exit 1;
}

# The fasta output file must be given
if (!$fastafile) {
	print qq(Please enter an output (fasta) file!\n);
	exit 1;
}

# The genomes directory must be given and must exist
if (!$genomesdir) {
	print qq(Please enter the genomes directory!\n);
	exit 1;
}
elsif (!-d $genomesdir) {
	print qq(There is no genomes directory $genomesdir!\n);
	exit 1;
}

# A negative extension would shrink the locations and could produce
# ranges with start > end, so reject it here
if ($extractext < 0) {
	print qq(The extract extension must not be negative!\n);
	exit 1;
}

#
# ------------------------------------------------------------------------
# Returns the chromosome hash value
# ------------------------------------------------------------------------
#
sub chrsizehash {
	my ($name) = @_;

	# Every character contributes (character code - 32), weighted with
	# its 1-based position in the name
	my @chars    = split(//, $name);
	my $weighted = 0;
	for my $pos (0 .. $#chars) {
		$weighted += (ord($chars[$pos]) - 32) * ($pos + 1);
	}

	# Reduce the sum to a bucket index
	return $weighted % 1999;
}

#
# ------------------------------------------------------------------------
# Return a reference to the sizes array
# ------------------------------------------------------------------------
#
sub getChrSizes {
	my ($syn) = @_;

	# Reads <genomesdir>/<getSys(syn)>/calc.genome.sizes and returns a
	# reference to a bucket array: the index is chrsizehash(chromosome),
	# every used bucket is a reference to a list of [chromosome, size]
	# pairs. The array stays empty if the file is missing or unreadable.
	my @sizes;

	my $file = join("/", $genomesdir, nway::genomesconfig::getSys($syn), "calc.genome.sizes");
	return \@sizes if (!-r $file);

	my $fh;
	if (!open($fh, "<", $file)) {
		print "Cannot read file $file!\n";
		return \@sizes;
	}

	while (my $line = <$fh>) {
		chomp($line);

		# split(' ') also ignores leading whitespace
		my ($chr, $size) = split(' ', $line);

		# Skip empty lines and lines without a numeric size (e.g. a
		# header line) - such a size would break the range clamping
		next if (!defined($chr) || !defined($size) || $size !~ m/^\d+$/);

		push(@{ $sizes[chrsizehash($chr)] }, [$chr, $size]);
	}
	close($fh);

	return \@sizes;
}

#
# ------------------------------------------------------------------------
# Returns the chromosome size - or 0 if not found
# ------------------------------------------------------------------------
#
sub getChrSize {
	my ($sizes, $chr) = @_;

	# Pick the bucket of the chromosome - no bucket, no size
	my $bucket = $sizes->[chrsizehash($chr)];
	return 0 if (!$bucket);

	# A bucket holds [chromosome, size] pairs; take the first exact match
	for my $entry (@{$bucket}) {
		return $entry->[1] if ($entry->[0] eq $chr);
	}

	return 0;
}

#
# ------------------------------------------------------------------------
# Extend start and end
# ------------------------------------------------------------------------
#
sub extend {
	my ($start, $end, $size, $extractext) = @_;

	# Widen the range on both sides
	my $lower = $start - $extractext;
	my $upper = $end + $extractext;

	# The range must not begin before the first position ...
	if ($lower < 1) {
		$lower = 1;
	}

	# ... and must not exceed the chromosome size - if the size is known
	if ($size) {
		$upper = $size if ($size <= $upper);
	}

	return ($lower, $upper);
}

#
# ------------------------------------------------------------------------
# The fasta generation part
# ------------------------------------------------------------------------
#
sub main {
	# Reads the nway result file and - species by species - lets
	# "samtools faidx" extract the extended locations. The sequences are
	# appended to the fasta file, labelled with the species and the
	# original coordinates; minus strand sequences are reverse complemented.
	# Uses the file level settings $nwayfile, $fastafile, $genomesdir and
	# $extractext. Dies if one of the needed files cannot be opened.
	my @rows;

	print "==> Run samtools\n";

	print "Read nway data\n";

	# The line length of the sequences
	my $len = 100;

	open(my $nway, "<", $nwayfile) or die "Cannot open input (nway) file $nwayfile: $!\n";

	# The first line names the species, one per column. An empty file
	# results in no species at all (and an empty fasta file).
	my $head = <$nway>;
	$head = "" if (!defined($head));
	chomp($head);
	my @syns = split(/\t/, $head);

	while (my $line = <$nway>) {
		chomp($line);
		my @cols = split(/\t/, $line);

		# Check for minus coordinates: such a column is replaced by
		# "N/<second field>", which is skipped when the locations are
		# collected below
		for my $col (@cols) {
			my @parts = split(/\//, $col);
			if (@parts > 2 && $parts[2] =~ m/:-|--/) {
				$col = "N/$parts[1]";
			}
		}

		push(@rows, \@cols);
	}
	close($nway);

	# Temporary files, created in the current directory
	my $samtoolsfile = "calc.tmp.samtools";
	my $xlocsfile    = "$samtoolsfile.xlocs";
	my $locsfile     = "$samtoolsfile.locs";

	open(my $fasta, ">", $fastafile) or die "Cannot write output (fasta) file $fastafile: $!\n";

	# Loop over all species
	for my $i (0 .. $#syns) {
		my $syn = $syns[$i];

		my $sizes = getChrSizes($syn);
		my $count = 0;

		# Create a location and extended location file - one
		# for samtools and one later to add the original coordinates
		open(my $xlocs, ">", $xlocsfile) or die "Cannot write file $xlocsfile: $!\n";
		open(my $locs,  ">", $locsfile)  or die "Cannot write file $locsfile: $!\n";
		for my $row (@rows) {
			my $cell = $row->[$i];

			# Short rows have no entry for this species
			next if (!defined($cell));

			if ($cell =~ m|./.+?/(.+?):(.+?)\-(.+?)/(.)|) {
				my ($chr, $start, $end, $strand) = ($1, $2, $3, $4);

				# Extend range
				my ($xstart, $xend) = extend($start, $end, getChrSize($sizes, $chr), $extractext);

				print {$xlocs} "$chr:$xstart-$xend\n";
				print {$locs} "$chr:$start-$end/$strand\n";

				$count++;
			}
		}
		close($locs)  or die "Cannot write file $locsfile: $!\n";
		close($xlocs) or die "Cannot write file $xlocsfile: $!\n";

		# Call samtools, sum up all results and add the species in headers
		my $file = join("/", $genomesdir, nway::genomesconfig::getSys($syn), "calc.genome.fa");
		if (!-r qq($file.fai)) {
			print "Cannot read file $file.fai!\n";

			# Skip this species: the samtools result file would still hold
			# the sequences of the previous species (or of an earlier run)
			# and they would be written under the wrong species name
			next;
		}

		print "Run samtools on $syn with $count locations...\n";
		my $starttime = time;

		# The genome path is quoted, so that blanks or shell meta
		# characters in it cannot break the command
		my $status = system(qq(samtools faidx -n $len -r $xlocsfile ) . _shellQuote($file) . qq( >$samtoolsfile 2>/dev/null));
		print "Duration: " . (time - $starttime) . " seconds\n";

		# Best effort: whatever samtools has written is still used
		if ($status != 0) {
			print "samtools failed on $syn (status $status), the sequences may be incomplete!\n";
		}

		# Concat the original coordinate (to identify the sequence)
		# And don't forget to complement a minus strand!

		my $header;
		my $seq    = "";
		my $strand = "+";

		open(my $sam, "<", $samtoolsfile) or die "Cannot read file $samtoolsfile: $!\n";
		open($locs, "<", $locsfile) or die "Cannot read file $locsfile: $!\n";

		while (my $line = <$sam>) {
			chomp($line);
			if ($line =~ m/^>/) {

				# A new header completes the previous record
				_writeFastaRecord($fasta, $header, $seq, $strand, $len) if ($header);
				$seq = "";

				$line =~ s/^>/>$syn\//;

				# The n-th samtools record belongs to the n-th location line
				my $loc = <$locs>;
				$loc = "" if (!defined($loc));
				chomp($loc);

				# The last character is the strand, the part behind the
				# first colon are the original coordinates
				my ($locstrand) = $loc =~ m/(.)$/;
				$strand = defined($locstrand) ? $locstrand : "+";
				$loc =~ s/^.*?://;

				$header = qq($line ($loc));
			}
			else {
				$seq .= $line;
			}
		}

		# Don't forget the last record
		_writeFastaRecord($fasta, $header, $seq, $strand, $len) if ($header);

		close($locs);
		close($sam);
	}

	close($fasta) or die "Cannot write output (fasta) file $fastafile: $!\n";

	# Delete temporary files
	for my $tmpfile ($samtoolsfile, $locsfile, $xlocsfile) {
		unlink($tmpfile) if (-f $tmpfile);
	}
}

#
# ------------------------------------------------------------------------
# Writes one fasta record (header and sequence, wrapped to $len characters
# per line) - the sequence of a minus strand is reverse complemented
# ------------------------------------------------------------------------
#
sub _writeFastaRecord {
	my ($fh, $header, $seq, $strand, $len) = @_;

	if ($strand eq "-") {
		$seq =~ tr/acgtACGT/tgcaTGCA/;
		$seq = reverse $seq;
	}
	print {$fh} $header, "\n", join("\n", $seq =~ /.{1,$len}/g), "\n";
}

#
# ------------------------------------------------------------------------
# Returns the argument as a single quoted shell word
# ------------------------------------------------------------------------
#
sub _shellQuote {
	my ($arg) = @_;

	# A single quote cannot be escaped inside single quotes: close the
	# quoting, add an escaped quote and open the quoting again
	$arg =~ s/'/'\\''/g;
	return "'$arg'";
}

main();
