#!/usr/bin/perl -w

# permute_pli.pl -- Takes single pli XML file & integer "distance" to
# permute it; spits out XML with the experimental annotations shuffled
# the specified distance.

# Usage:
#    ./permute_pli.pl -i deaminase.pli -d 2 > shuffled.pli

use strict;
use Getopt::Std;
use XML::DOM;


#-----------------------------------------------------------------------------
# PRE : XML doc
# POST: annotations (GONumber & MOC) loaded for each protein (undef if
# no annot for a particular prot)
sub ExtractAnnotations($$) {
    my ($doc, $annots) = @_;

    foreach my $prot ($doc->getDocumentElement()->getChildNodes()) {
	if ($prot->getNodeName() eq 'Protein') {
	    push @$annots, [undef, undef];
	    foreach my $element ($prot->getChildNodes()) {
		if ($element->getNodeName() eq 'GONumber') {
		    $$annots[$#$annots][0] = $element;
		} elsif ($element->getNodeName() eq 'MOC') {
		    $$annots[$#$annots][1] = $element;
		}
	    }
	}
    }
}

#-----------------------------------------------------------------------------
# PRE : ordered list of annotations(TM), distance to permute
# POST: a new, random ordering of annotations
sub ShuffleAnnotations($$) {
    my ($annots, $dist) = @_;
    my (@trueAnnotPos);

    #collect IDA/IMP/GOR annotations
    for (my $i = 0;  $i < @$annots;  ++$i) {
	push @trueAnnotPos, $i if (defined $$annots[$i][1] &&
				   $$annots[$i][1]->getFirstChild->getNodeValue
				   =~ /(IDA)|(IMP)|(GOR)/);
    }

    #shuffle list of true annots
    for (my $i = 0;  $i < @trueAnnotPos;  ++$i) {
	my $r = int(rand($#trueAnnotPos - $i)) + $i;
	@trueAnnotPos[$i, $r] = @trueAnnotPos[$r, $i];
    }

    if ($dist > @trueAnnotPos) {
	warn "Requested swap distance ($dist) is greater than the number of ".
	     "true (IDA|IMP|GOR) annotations (".scalar(@trueAnnotPos).").\n";
    }

    #insert shuffled annots at random positions
    for (my $i = 0;  $i < $dist  &&  @trueAnnotPos;  ++$i) {
	my $r = int(rand(@$annots - 1));
	my $annotPos = pop @trueAnnotPos;
	@$annots[$r, $annotPos] = @$annots[$annotPos, $r];
    }
}

#-----------------------------------------------------------------------------
# PRE : XML doc, ordered list of annotations
# POST: annotations added to XML *in order*
sub RestoreAnnotations($$) {
    my ($doc, $annots) = @_;

    foreach my $prot ($doc->getDocumentElement()->getChildNodes()) {
	if ($prot->getNodeName() eq 'Protein') {
	    my $annot = shift @$annots;
	    $prot->appendChild($$annot[0]) if (defined $$annot[0]);
	    $prot->appendChild($doc->createTextNode("\n\t"));
	    $prot->appendChild($$annot[1]) if (defined $$annot[1]);
	    $prot->appendChild($doc->createTextNode("\n\t"));
	}
    }
}


#-----------------------------------------------------------------------------
# PRE : XML doc
# POST: duplicate adjacent text elements deleted
sub CleanUpXML($) {
    my ($doc) = @_;

    foreach my $prot ($doc->getDocumentElement()->getChildNodes()) {
	if ($prot->getNodeName() eq 'Protein') {
	    my $txtCnt = 0;
	    foreach my $element ($prot->getChildNodes()) {
		if ($element->getNodeType() == TEXT_NODE) {
		    ++$txtCnt;
		    $prot->removeChild($element) if ($txtCnt > 1);
		} else {
		    $txtCnt = 0;
		}
	    }
	}
    }
}

#-----------------------------------------------------------------------------
# Main
{
    my %opts;
    getopts('i:d:', \%opts) && exists($opts{'i'})  && exists($opts{'d'}) or die
	"Usage: $0 -i <pli file> -d <distance>\n".
	"     -->Distance specifies # of swaps away from original.\n\n";

    my $parser = new XML::DOM::Parser;
    my $doc = $parser->parsefile($opts{'i'});

    my @annots;

    ExtractAnnotations($doc, \@annots);
    ShuffleAnnotations(\@annots, $opts{'d'});
    RestoreAnnotations($doc, \@annots);

    CleanUpXML($doc); #strictly cosmetic

    $doc->printToFileHandle(\*STDOUT);

    $doc->dispose(); #clean up memory (!)
}
