#!/usr/bin/perl -w
#
# pfam2pli.pl -- Parses PFAM flat file to extract one family and stuff
# into a .pli file (see pli.dtd for specification), including possible
# user-supplied annotations
# Example:
#    ./pfam2pli.pl -i PF00069 | xsltproc indent.xsl - > kinase.pli


use strict;
use Getopt::Std;
use XML::DOM;
use FF_Index;

#global constants
my $PFAM = '/scrg/bee/db/Pfam20/xaa';
my $GOA = '/scrg/bee/db/goa/gene_association.goa_uniprot';
my $TREE = '/scrg/bee/db/Pfam20/domain.pnh';

#-----------------------------------------------------------------------------
# PRE : pfam family id OR optional Stockholm formatted aln file
# (ie. use *instead* of PFAM)
# POST: hash containing info about proteins found in this family
# (specifically, alignment & accession)
sub ParsePFAM($$$) {
    my ($familyId, $optAlnFile, $proteins) = @_;

    my ($pfam, $inFILE);
    if (defined $optAlnFile) {
	open INFILE, $optAlnFile or die "'$optAlnFile':  $!";
	$inFILE = \*INFILE;
    } else {
	$pfam = FF_Index->new($PFAM);
	$inFILE = $pfam->GetRecord($familyId);
	if (!defined $inFILE) {
	    die "Could not find '$familyId' in $PFAM.\n";
	}
    }

    # parse our family
    while (<$inFILE>) {
	last if (/^\/\//); # // marks end of entry
	next if (/^$/); #annoying blank lines

	if (/^\#=GS .+\/[\d\-\s]+AC .+$/) { #accession line
	    my ($id, $acc) = /^\#=GS (.+)\/[\d\-\s]+AC (.+)$/;
	    $$proteins{$id}{'ProteinName'} = $id;
	    $$proteins{$id}{'ProteinNumber'} = $acc;
	} elsif (!/^\#/) { # alignment lines are only ones w/o pound symbol
	    my ($id, $aln) = /^([^\/]+?)(?:\/[\d\-]+)?\s+(.+)$/;
	    $aln =~ s/\./-/g; #new cleanup routine, added by PLFJ
	    $aln = uc($aln);  #new cleanup routine, added by PLFJ
	    $$proteins{$id}{'Alignment'} = $aln;
	    if (!exists $$proteins{$id}{'ProteinName'}) {
		$$proteins{$id}{'ProteinName'} = $id;
		$$proteins{$id}{'ProteinNumber'} = $id;
	    }
	}
    }

}

#-----------------------------------------------------------------------------
# PRE : hash containing info about proteins, string containing type of
# GOA annotation to match (F|P|C)
# POST: hash containing GO annotations for any prots found in $GOA file
sub ParseGOA($$) {
    my ($proteinsRef, $goaType) = @_;
    
    #for format, see ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README
    die "Invalid GOA type" if $goaType !~ /^F|P|C$/;
    my $goa = FF_Index->new($GOA);

    my $c = 0;
    foreach my $prot (keys(%$proteinsRef)) {
	print STDERR int(100 * ++$c/scalar(keys(%$proteinsRef))), "%\r";

	my $inFILE = $goa->GetRecord($prot);
	if (defined $inFILE) {
	    while (<$inFILE>) {
		my @F = split /\t/;
		last if $F[2] ne $prot;
		if ($F[3] eq ''  &&  $F[8] eq $goaType) {
		    push @{$$proteinsRef{$prot}{'GONumber'}}, $F[4] =~ /GO:(\d+)/;
		    push @{$$proteinsRef{$prot}{'MOC'}}, $F[6];
		}
	    }
	}
    }
}

#-----------------------------------------------------------------------------
# PRE : hash containing info about proteins, user input filename with
#       line format <prot num>\t<goterm>/<goterm> ...
# POST: hash containing GO annotations for any prots found in $GOA file
sub ParseAnnotations($$) {
    my ($proteinsRef, $annotFile) = @_;
    
    open(INFILE, $annotFile) 
	|| die "Cannot find annotation file: ".$annotFile."\n";

    my %inFile;
    while(<INFILE>) {
	my @F = split /[\,\s]+/;
        #print "Got .".$F[0].", ".$F[1].".\n";
	if (defined @F) {
	    my $name = "";
	    foreach my $goNum (@F) {
		if($name eq "") {
		    $name = $F[0];
		} else {
		    push @{$inFile{$F[0]}}, $goNum;
		    #print $goNum." added\n";
		}
	    }
	} else {
	    print "didn't find protein ".$F[0]."\n";
	}
    }
    foreach my $prot (keys(%$proteinsRef)) {
	my $protID = $$proteinsRef{$prot}{'ProteinNumber'};
	if (exists $inFile{$protID}) {
	    foreach my $goNum (@{$inFile{$protID}}) {
		push @{$$proteinsRef{$prot}{'GONumber'}}, $goNum;
		push @{$$proteinsRef{$prot}{'MOC'}}, "TAS";
	    }
	}
    }
}

#-----------------------------------------------------------------------------
# PRE : current family ID
# POST: species tree for this family
sub ParseSpeciesTree($$) {
    my ($familyId, $speciesTree) = @_;

    my $trees = FF_Index->new($TREE);
    my $inFILE = $trees->GetRecord($familyId);

    if (defined $inFILE) {
	$$speciesTree = <$inFILE>;
	chomp $$speciesTree;
    } else {
	die "Could not find '$familyId' in $TREE.\n";
    }
}

#-----------------------------------------------------------------------------
# PRE : XML node, name of new element, text contents of new element
# POST: child node added named $key with $value inside it
sub AddElement($$$) {
    my ($parent, $key, $value) = @_;

    my $node = $parent->getOwnerDocument()->createElement($key);
    $node->appendChild($parent->getOwnerDocument()->createTextNode($value));
    $parent->appendChild($node);
}

#-----------------------------------------------------------------------------
# PRE : current protein family XML node, info about one protein
# POST: protein info added to family
sub AddProtein($$) {
    my ($fam, $proteinInfo) = @_;

    my $prot = $fam->getOwnerDocument()->createElement("Protein");
    $fam->appendChild($prot);

    foreach my $tag ('ProteinName', 'ProteinNumber', 'GONumber', 'MOC',
		     'Alignment') {
	if (exists ($$proteinInfo{$tag})) {
	    if (ref($$proteinInfo{$tag}) eq 'ARRAY') {
		AddElement($prot, $tag,
			   '['.join(", ", @{$$proteinInfo{$tag}}).']');
	    } else {
		AddElement($prot, $tag, $$proteinInfo{$tag});
	    }
	}
    }

}

#-----------------------------------------------------------------------------
# Main
{
    my %opts;
    $opts{g} = 'F';
    getopts('i:g:p:u:n:', \%opts) && exists($opts{'i'}) or die
	"Usage: $0 -i <pfam accession> [-g F|P|C] [-p <stockholm align>]\n".
	"	-g --> type of GOA annotations to use (default == F)\n".
	"	       F = molecular function, P = biological process, C = cellular component\n".
	"	[-p] --> lets you use your own Stockholm-formatted alignment\n".
	"	[-n] --> don't use the GOA annotations\n".
	"	[-u] --> lets you input your own set of GO annotations)\n\n";

    my $familyId = $opts{'i'};
    my %proteins;
    my $speciesTree;

    print STDERR "Parsing PFAM...\n";
    ParsePFAM($familyId, $opts{p}, \%proteins);
    if(!exists($opts{'n'})) {
	print STDERR "Parsing GOA...\n";
	ParseGOA(\%proteins, $opts{g});
    }
    if(exists($opts{'u'})) {
	print STDERR "Parsing user annotations...\n";
	ParseAnnotations(\%proteins, $opts{u});
    }
    print STDERR "Parsing TREE...\n";
    ParseSpeciesTree($familyId, \$speciesTree);


    # Stuff into XML objects
    print STDERR "Creating XML...\n";

    my $doc = new XML::DOM::Document;
    my $fam = $doc->createElement("Family");
    $doc->appendChild($fam);

    AddElement($fam, "FamilyID", $familyId);
    foreach my $proteinInfo (values(%proteins)) {
	AddProtein($fam, $proteinInfo);
    }
    AddElement($fam, "SpeciesTree", $speciesTree);


    # Dump XML
    print $doc->toString();#$doc->printToFile ("out.xml");

    $doc->dispose(); #clean up memory (!)
}
