#!/usr/bin/env perl

#
# ========================================================================
# Convert bed to fasta format
# ========================================================================
#

use strict;

use Getopt::Long;
use File::Basename;

my ($bedfile, $tfafile, $tfaoutfile, $qfafile, $qfaoutfile);

if (!@ARGV) {
	my $name = basename($0);

	print <<EOS;
USAGE: $name -fb bed-file -ft target-fasta-file -fto target-fasta-file-out -fq query-fasta-file -fqo query-fasta-file-out

DESCRIPTION
    This script takes a bed file (for the coordinates) and two fasta
    files called target-fasta-file and query-fasta-file and generates
    the target and query fasta files based on the coordinates of the
    input file.  It may be called like:

      $name bedfile input.target t input.query q

    The result (files t and q) may look like

      >MSTS01000001.1:8719032:8719033 (len:2)
      CC
      >MSTS01000004.1:26166122:26166123 (len:2)
      CC
      >MSTS01000005.1:18512207:18513235 (len:1029)
      TCCTGAAAGGCAAAGCAGAGGAAATTTGATAATGTTTATTCAAACCGGTGGTGTCATCACAGGCTGGTCTCCAACTTAATGACCATATAACCATTCTATC
      TGCTTTATTACTAGGAAATAACAGAAGTTTAAAGTCATAGCTAAGATACTAGGGACTTCACATTTTAAAATTATTTGGGTACATGTCACTACTGTCACAG
      AAAAAATGACATCTTTTAGGATGACTTTTTTTCTCTGCTACTCATTTCTTGGTTCAAGGGAATATCAGGTCTACATTTATACAATGTACTATCAAATTTA
      TCTTTAGTTTCTATAACTTCTCTTAAGGTGTTAATTAATCTAATGTGGGAAGGATTAACAAAATTAATACTGGGGCATAGGGAAATGCTACCAACCCATT
      TGCTATATTTGTGGTTCTAAAGCTGAGGTTGTTAATGTTCCGAGGAATCTGTGAATAGATTTTAGGGGGTTTGTGAATTTGGATGGGGGAAAAAAATTCC
      ATCTTTAGTTTCCCTAGCCTCTAGAGTATTTCCTTCACTTATGAATGTAGGAGAAAAAAAAAAACCTTATTCTGAGAAGGGTTCCACATCTATAGGCTTC
      ACCAAGCTCCCAAAGGTGTTTATAAAACACAAAAGGATTAAGAACCCTTTGCTTTAAAGCAGAGGTTCCCAAACTTATTTGACCTACTGCTCCTTTTTAA
      AAAAAAATTACTCAGTGTCCCCCTGGGGTTAAGTTAGGTTAACCCTTATTTTTATTCAAGAGCCCCCAATTGCACCCGAGGCTACTATCACCCCCCTGGA
      TAGTTCCAGCACACCCCAAGGGGTGGTATCTCCCACTCTGGGAACCTATGCAAAGGAACATGTAAATACTCCTAAAGGACATAGGGAAAACCTTCTTTTC
      AGTCAGCCCAATGATAGCATTGAAATGTGAATGAAACATGGAGGCGCAACTCTGATTTGAGATAAAATTTCTCATCAATAATGGGAAAGAGGGGGCCTTT
      AAGATTAAGATCTAACTCTACATACTCAC

    The header lines contain following information:

      ">" chromosome ":" start ":" end BLANK "(len:" seq-length ")"
EOS
	exit 0;
}

#
# ------------------------------------------------------------------------
# Get parameters and check them
# ------------------------------------------------------------------------
#

exit 1
  if (
	!GetOptions(
		'fb=s'  => \$bedfile,
		'ft=s'  => \$tfafile,
		'fto=s' => \$tfaoutfile,
		'fq=s'  => \$qfafile,
		'fqo=s' => \$qfaoutfile
	)
  );

if (!$bedfile || !$tfafile || !$qfafile || !$tfaoutfile || !$qfaoutfile) {
	print qq(Please enter 5 filenames!\n);
	exit 1;
}
if (!-r $bedfile || !-r $tfafile || !-r $qfafile) {
	print qq(The bed, target- and/or query-fasta file cannot be read!\n);
	exit 1;
}

#
# ------------------------------------------------------------------------
# Write the parts for a specific chromosome
# ------------------------------------------------------------------------
#
sub writeparts {
	my ($chr, $a) = @_;

	return if (!$a);

	for my $range (@{$a}) {
		my ($start, $end) = @{$range};
		if ($start >= $end) {
			print FASTAOUT ">$chr:$start:$end - ERROR: START >= END!\n";
			next;
		}
		my $s = substr($_[2], $start - 1, $end - $start + 1);
		if ($s) {
			print FASTAOUT ">$chr:$start:$end (len:" . ($end - $start + 1) . ")\n";
			print FASTAOUT join("\n", $s =~ /.{1,100}/gs) . "\n";
		}
	}
}

#
# ------------------------------------------------------------------------
# Read fasta file and generate an output fasta file
# ------------------------------------------------------------------------
#
sub analyze {
	my ($infile, $outfile, $parts) = @_;

	my ($chr, $seq);

	open(FASTAIN,  "<", $infile)  or die qq(Cannot open fasta input file $infile\n);
	open(FASTAOUT, ">", $outfile) or die qq(Cannot open fasta output file $outfile\n);
	while (my $line = <FASTAIN>) {
		chomp($line);
		if ($line =~ m/^>([^\s]+)/) {
			my $x = $1;
			writeparts($chr, $parts->{$chr}, $seq) if ($chr && $seq);
			$chr = $x;
			$seq = "";
		}
		else {
			$seq .= $line;
		}
	}
	writeparts($chr, $parts->{$chr}, $seq) if ($chr && $seq);
	close(FASTAOUT);
	close(FASTAIN);
}

#
# ------------------------------------------------------------------------
# Return the parts data
# ------------------------------------------------------------------------
#
sub getparts {
	my ($file) = @_;

	# The target and query parts
	my (%tparts, %qparts);

	die qq(Cannot read file $file\n) if (!-r $file);
	open(BED, "<", $file);
	while (my $line = <BED>) {
		chomp($line);

		my ($index, $tchr, $tstart, $tend, $qchr, $qstart, $qend, $strand, $score) = split(/\s+/, $line);

		# If the data is from a gap (no score info!) - adjust start and end
		if (!$score) {
			$tstart++;
			$tend--;
			$qstart++;
			$qend--;
		}

		my $x = $tparts{$tchr};
		$tparts{$tchr} = $x = [] if (!$x);
		push(@{$x}, [$tstart, $tend]);

		$x = $qparts{$qchr};
		$qparts{$qchr} = $x = [] if (!$x);
		push(@{$x}, [$qstart, $qend]);
	}
	close(BED);

	return (\%tparts, \%qparts);
}

#
# ------------------------------------------------------------------------
# The main routine
# ------------------------------------------------------------------------
#
sub main {
	my ($tparts, $qparts) = getparts($bedfile);
	analyze($tfafile, $tfaoutfile, $tparts);
	analyze($qfafile, $qfaoutfile, $qparts);
}

main();
