#!/usr/bin/perl -w

# pli2tree.pl -- Takes single pli XML file, passes through PAUP* to
# generate gene tree, extracts species tree from PLI, passes gene &
# species trees through FORESTER and dumps reconciled tree in New
# Hampshire eXtended format to stdout.
#
# Usage:
#    ./pli2tree.pl -i deaminase.pli > deaminase.nhx

use strict;
use Getopt::Std;
use File::Temp;
use XML::DOM;
use Tree::DAG_Node;
no warnings 'recursion'; #big trees can led to deep recursion

#-----------------------------------------------------------------------------
# PRE : pfam species tree in string form ("psuedo-New Hampshire format")
# POST: reference to tree populated in memory
sub ParseTreeString($) {
    my ($treeStrRef) = @_;

    my $root = new Tree::DAG_Node();
    my $curNode = $root;
    $root->name("root");

    my @treeStr = split //, $$treeStrRef;
    for (my $i=0;  $i < @treeStr  &&  $treeStr[$i] ne ';';  ++$i) {
	if ($treeStr[$i] eq '(') {
	    $curNode = $curNode->new_daughter();
	} elsif ($treeStr[$i] eq ')') {
	    die if !defined $curNode->mother();
	    $curNode = $curNode->mother();
	} elsif ($treeStr[$i] eq ',') {
	    die if !defined $curNode->mother();
	    $curNode = $curNode->mother()->new_daughter();
	} elsif ($treeStr[$i] eq '[') { #brackets enclose comment or NHX
	    for (;  $i < @treeStr  &&  $treeStr[$i] ne ']';  ++$i) {}
	} else {
	    my $j;
	    for ($j=$i;  $j < @treeStr  &&  $treeStr[$j] ne ':';  ++$j) {}
	    my $name = join('',@treeStr[$i..$j-1]);
	    $i = $j+1;
	    for ($j=$i;  $j < @treeStr  &&  $treeStr[$j] =~ /[\d\.-]/; ++$j) {}
	    my $branchLen = join('',@treeStr[$i..$j-1]);

	    #print "$name\t$branchLen\n";
	    $curNode->name($name);
	    $curNode->attributes()->{'br_len'} = $branchLen;
	    $i = $j-1 if ($i < $j);
	}
    }

    return $root;
}

#-----------------------------------------------------------------------------
# PRE : node, reference to string
# POST: string contains "New Hampshire eXtended" formatted tree (only names leaf nodes)
sub Tree2NHX($$);
sub Tree2NHX($$) {
    my ($node, $resultRef) = @_;

    if ($node->daughters() == 0) {
	$$resultRef .= $node->name().':'.$node->attributes()->{'br_len'}.
	    '[&&NHX]';
    } else {
	my @children = $node->daughters();
	$$resultRef .= '(' if (@children > 1);
	for (my $childI = 0;  $childI < @children;  ++$childI) {
	    if ($childI != 0) {
		$$resultRef .= ',';
	    }
	    Tree2NHX($children[$childI], $resultRef);
	}
	if (@children > 1) {
	    $$resultRef .= ')';
	    $$resultRef .= ':'.$node->attributes()->{'br_len'} if exists $node->attributes()->{'br_len'};
	    $$resultRef .= '[&&NHX]';
	}
	if (!defined $node->mother()) {
	    $$resultRef .= ';';
	}
    }
}

#-----------------------------------------------------------------------------
# PRE : node, hash reference
# POST: a few stats about this node's topology (useful for debugging!)
sub CalcTreeStats($$);
sub CalcTreeStats($$) {
    my ($node, $stats) = @_;

    if (exists $node->attributes()->{'br_len'}  &&
	$node->attributes()->{'br_len'} == 0) {
	$stats->{'zeros'}++;
    }

    if ($node->daughters() == 0) {
    } else {
	for (my $childI = 0;  $childI < $node->daughters();  ++$childI) {
	    CalcTreeStats(($node->daughters())[$childI], $stats);
	}
	$stats->{scalar($node->daughters()).'-furcate'}++;
    }
}

#-----------------------------------------------------------------------------
# PRE : node with >2 daughters
# POST: daughters 2->n added as "psuedo-nodes" (branch length 0)
sub BinarizeNode($) {
    my ($node) = @_;

    if ($node->daughters() <= 2) { # node already binarized!
	return;
    }

    my @children = $node->daughters();
    @children = @children[0..$#children-1]; #first child can stay where it is
    $node->remove_daughters(@children);
    for (my $childI = 0;  $childI < @children-1;  ++$childI)  {
	$node = $node->new_daughter();
	$node->attributes()->{'br_len'}=0;
	$node->add_daughter($children[$childI]);
    }
    $node->add_daughter($children[@children-1]);
}

# -----------------------------------------------------------------------------
# BLOSUM50 scoring matrix yanked from BLAST & then converted to a cost
# matrix via the formula: C[i,j] = -(2*S[i,j] - S[i,i] - S[j,j]).  See
# also R script: 'score2cost.r'
sub PrintCostMatrix($) {
    my ($OUTFILE) = @_;
    print $OUTFILE "\tusertype BLOSUM50 (stepmatrix)=22
A R N D C Q E G H I L K M F P S T W Y V * -
0 16 14 17 20 14 13 13 19 12 14 13 14 19 17 8 10 26 17 10 16 10
16 0 16 19 28 12 13 21 17 20 18 7 18 21 23 14 14 28 17 18 18 12
14 16 0 11 24 14 13 15 15 18 20 13 18 23 21 10 12 30 19 18 18 12
17 19 11 0 29 15 10 18 20 21 21 16 23 26 20 13 15 33 22 21 19 13
20 28 24 29 0 26 25 27 29 22 22 25 24 25 31 20 20 38 27 20 24 18
14 12 14 15 26 0 9 19 15 18 16 9 14 23 19 12 14 24 17 18 18 12
13 13 13 10 25 9 0 20 16 19 17 10 17 20 18 13 13 27 18 17 17 11
13 21 15 18 27 19 20 0 22 21 21 18 21 24 22 13 17 29 22 21 19 13
19 17 15 20 29 15 16 22 0 23 21 16 19 20 24 17 19 31 14 23 21 15
12 20 18 21 22 18 19 21 23 0 6 17 8 13 21 16 12 26 15 2 16 10
14 18 20 21 22 16 17 21 21 6 0 17 6 11 23 16 12 24 15 8 16 10
13 7 13 16 25 9 10 18 16 17 17 0 17 22 18 11 13 27 18 17 17 11
14 18 18 23 24 14 17 21 19 8 6 17 0 15 23 16 14 24 15 10 18 12
19 21 23 26 25 23 20 24 20 13 11 22 15 0 26 19 17 21 8 15 19 13
17 23 21 20 31 19 18 22 24 21 23 18 23 26 0 17 17 33 24 21 21 15
8 14 10 13 20 12 13 13 17 16 16 11 16 19 17 0 6 28 17 14 16 10
10 14 12 15 20 14 13 17 19 12 12 13 14 17 17 6 0 26 17 10 16 10
26 28 30 33 38 24 27 29 31 26 24 27 24 21 33 28 26 0 19 26 26 20
17 17 19 22 27 17 18 22 14 15 15 18 15 8 24 17 17 19 0 15 19 13
10 18 18 21 20 18 17 21 23 2 8 17 10 15 21 14 10 26 15 0 16 10
16 18 18 19 24 18 17 19 21 16 16 17 18 19 21 16 16 26 19 16 0 6
10 12 12 13 18 12 11 13 15 10 10 11 12 13 15 10 10 20 13 10 6 0
\t;\n";
    print $OUTFILE "\tTYPESET * BLOSUM50 = BLOSUM50: all;\n";
}

#-----------------------------------------------------------------------------
# PRE : PLI document, output filename to be embedded in PAUP script,
# output file handle
# POST: NEXUS file created (for input to PAUP*)
sub CreateNEXUSFile ($$$) {
    my ($doc, $outFileName, $OUTFILE) = @_;

    my $names = $doc->getElementsByTagName("ProteinName");
    my $alns  = $doc->getElementsByTagName("Alignment");

    print $OUTFILE "#NEXUS\n\n";

    # list all proteins as "taxa"
    print $OUTFILE "begin taxa;\n";
    print $OUTFILE "\tdimensions ntax=", $names->getLength(), ";\n";
    print $OUTFILE "\ttaxlabels\n";
    for (my $i=0;  $i < $names->getLength();  ++$i) {
	print $OUTFILE "\t\t".$names->item($i)->getFirstChild()->getNodeValue(),
	              "\n";
    }
    print $OUTFILE "\t;\n";
    print $OUTFILE "end;\n\n";

    # dump alignments for these proteins
    print $OUTFILE "begin characters;\n";
    print $OUTFILE "\tdimensions nchar=", length($alns->item(0)->getFirstChild()->getNodeValue()), ";\n";
    print $OUTFILE "\tformat gap=- datatype=protein;\n";
    print $OUTFILE "\tmatrix\n";
    for (my $i=0;  $i < $names->getLength();  ++$i) {
	print $OUTFILE "\t\t",
              $names->item($i)->getFirstChild()->getNodeValue(), "\t\t",
	      $alns->item($i)->getFirstChild()->getNodeValue(), "\n";
    }
    
    print $OUTFILE "\t;\n";
    print $OUTFILE "end;\n\n";

    #assumptions block (BLOSUM50 matrix in our case)
    print $OUTFILE "begin assumptions;\n";
    PrintCostMatrix($OUTFILE);
    print $OUTFILE "end;\n\n";

    #paup commands
    print $OUTFILE "begin paup;
	set criterion=parsimony maxtrees=100 increase=no;
	nj showtree=no;
[	hsearch addseq=random rearrlim=;]
	set outroot=paraphyl;
	savetrees file=$outFileName replace brlens=yes root=yes from=1 to=1 format=ALTNEX;
	quit;
";
    print $OUTFILE "end;\n";
    
}

#-----------------------------------------------------------------------------
# PRE : PLI document, output file handle
# POST: alignments dumped in Stockholm format
sub CreateStockholmFile ($$) {
    my ($doc, $OUTFILE) = @_;

    my $names = $doc->getElementsByTagName("ProteinName");
    my $alns  = $doc->getElementsByTagName("Alignment");

    print $OUTFILE "# STOCKHOLM 1.0\n";
    for (my $i = 0;  $i < $names->getLength();  ++$i) {
	print $OUTFILE "#=GS ".
	    $names->item($i)->getFirstChild()->getNodeValue(), "\n";
    }

    #NOTE: spec allows lines of any length, but quicktree can't handle lines > 4096 chars
    my $alnLength = $alns->getLength() ?
	length($alns->item(0)->getFirstChild()->getNodeValue()) : 0;
    my $offset = 0;
    while ($offset < $alnLength) {
	if ($offset > 0) {
	    print $OUTFILE "\n"; #block boundary
	}

	for (my $i = 0;  $i < $names->getLength();  ++$i) {
	    print $OUTFILE
		$names->item($i)->getFirstChild()->getNodeValue(), "\t",
		substr($alns->item($i)->getFirstChild()->getNodeValue(),
		       $offset, 4000), "\n";
	}
	$offset += 4000;
    }
}

#-----------------------------------------------------------------------------
# PRE : PLI document, output file handle
# POST: alignments dumped in PHYLIP format
sub CreatePHYLIPFile ($$) {
    my ($doc, $OUTFILE) = @_;

    my $alns  = $doc->getElementsByTagName("Alignment");

    print $OUTFILE $alns->getLength(), "\t",
          length($alns->item(0)->getFirstChild()->getNodeValue()), "\n";

    for (my $i=0;  $i < $alns->getLength();  ++$i) {
	my $n = $i; #use numbers instead of names because capped at 10 chars
	$n = (length($n) > 10) ? substr($n, 0,10) : $n.(' ' x (10-length($n)));
	print $OUTFILE $n, $alns->item($i)->getFirstChild()->getNodeValue(),
	      "\n";
    }
}

#-----------------------------------------------------------------------------
# PRE : PLI document, tree string from PHYLIP
# POST: tree string with node numbers replaced by gene names
sub SubstitutePHYLIPNames ($$) {
    my ($doc, $oldStr) = @_;
    my $newStr;

    my $names = $doc->getElementsByTagName("ProteinName");
    
    while ((my $p = index($oldStr, ':')) >= 0) {
	my $before = substr($oldStr, 0, $p+1);
	if ($before =~ /\d:$/ ) { #digit immediate before ':'?
	    my ($pre, $id) = $before =~ /(.*)(\d+):$/;
	    $newStr .= $pre.$names->item($id)->getFirstChild()->getNodeValue().
		':';
	} else {
	    $newStr .= $before;
	}
	$oldStr = substr($oldStr, $p+1);
    }

    return $newStr . $oldStr;
}

#-----------------------------------------------------------------------------
# PRE : PLI document, method of building tree, output handle for gene tree
# POST: gene tree created & saved in "New Hampshire" format
sub CreateGeneTree ($$$) {
    my ($doc, $treeMethod, $OUTFILE) = @_;
    my ($treeStr);
    my ($ALNFILE, $alnFile) = File::Temp::tempfile(UNLINK => 1);

    if ($treeMethod eq 'paup') {
	my (undef, $nexusOutFile) = File::Temp::tempfile(UNLINK => 1);
	CreateNEXUSFile($doc, $nexusOutFile, $ALNFILE); #aka paup input file 
	close $ALNFILE or die;
	system("paup -n $alnFile 1>&2") and die;
	($treeStr) = `grep '^tree ' $nexusOutFile` =~ /(\(.+\;)/;

    } elsif ($treeMethod eq 'phylip') {
	CreatePHYLIPFile($doc, $ALNFILE);
	close $ALNFILE or die;
	my (undef, $dOut) = File::Temp::tempfile(UNLINK => 1);
	my (undef, $tOut) = File::Temp::tempfile(UNLINK => 1);
	system("touch outfile"); #annoying hack to specify outfile name
	system("touch outtree");
	system("echo -e \"$alnFile\\nf\\n$dOut\\nr\\n2\\ny\\n\" | protdist > /dev/null") == 0 or die;
	system("echo -e \"$dOut\\nf\\n/dev/null\\nr\\n2\\n3\\ny\\nf\\n$tOut\\nr\\n\" | neighbor > /dev/null") == 0 or die;
	($treeStr) = `paste -s -d '' $tOut`;
	$treeStr = SubstitutePHYLIPNames($doc, $treeStr);
    } else { #treeMethod eq 'quicktree'	
	CreateStockholmFile($doc, $ALNFILE);
	close $ALNFILE or die;
	$treeStr = `quicktree $alnFile`;
	die "quicktree failed" if $? != 0;
	$treeStr =~ s/\n//g;
    }


    die "Error parsing gene tree." if !defined $treeStr;

    print STDERR $treeStr."\n";

    my $root = ParseTreeString(\$treeStr);
    BinarizeNode($root); #FORESTER requires root to be binary
    
    #print map("$_\n", @{$root->draw_ascii_tree});
    $treeStr = '';
    Tree2NHX($root, \$treeStr);
    print $OUTFILE $treeStr, "\n";
}

#-----------------------------------------------------------------------------
# PRE : PLI document, output handle for species tree
# POST: species tree extracted from PLI & converted in New Hampshire format
sub CreateSpeciesTree ($$) {
    my ($doc, $OUTFILE) = @_;
    
    my $speciesTree = $doc->getElementsByTagName("SpeciesTree");
    if ($speciesTree->getLength() != 1) {
	die "Couldn't find species tree in PLI file!\n";
    }

    my $root = ParseTreeString(\$speciesTree->item(0)->getFirstChild()->getNodeValue());

    my $realRoot = $root;
    while ($realRoot->daughters() == 1) {
	$realRoot= ($realRoot->daughters())[0];
    }
    BinarizeNode($realRoot); #FORESTER requires root to be binary

    #print map("$_\n", @{$root->draw_ascii_tree});

    my $nhxStr;
    Tree2NHX($root, \$nhxStr);
    print $OUTFILE $nhxStr, "\n";
}

#-----------------------------------------------------------------------------
# Main
{
    my %opts;
    $opts{'t'} = 'quicktree';
    getopts('i:s:t:', \%opts) && exists($opts{'i'}) or die
	"Usage: $0 -i <pli file> [-s gene|species] [-t paup|phylip|quicktree]\n".
	"	[-s] --> (optional) a single step to run\n".
	"	[-t] --> (optional) a tree building method (default:quicktree)\n\n";

    my $parser = new XML::DOM::Parser;
    my $doc = $parser->parsefile($opts{'i'});

    my ($fh, $geneTmp, $speciesTmp);

    if (!exists $opts{'s'}  ||  $opts{'s'} eq 'gene') {
	if (exists $opts{'s'}) {
	    open $fh, ">&STDOUT";
	} else {
	    ($fh, $geneTmp) = File::Temp::tempfile(UNLINK => 1);
	}
	CreateGeneTree($doc, $opts{'t'}, $fh);
	close $fh;
    }
    if (!exists $opts{'s'}  ||  $opts{'s'} eq 'species') {
	if (exists $opts{'s'}) {
	    open $fh, ">&STDOUT";
	} else {
	    ($fh, $speciesTmp) = File::Temp::tempfile(UNLINK => 1);
	}
	CreateSpeciesTree($doc, $fh);
	close $fh;
    }

    #NOTE: this calls a tweaked version of FORESTER that avoids opening an x-window
    if (!exists $opts{'s'}) {
	my (undef, $recTmp) = File::Temp::tempfile(UNLINK => 1);
	system("java -jar sdi.jar -n $speciesTmp $geneTmp $recTmp 1>&2") == 0
	    or die;
	system("cat $recTmp") == 0 or die;
    }
    
    $doc->dispose(); #clean up memory (!)
}
