#! /usr/bin/perl

################################################################################
### COPYRIGHT ##################################################################

# New York Genome Center

# SOFTWARE COPYRIGHT NOTICE AGREEMENT
# This software and its documentation are copyright (2017) by the New York
# Genome Center. All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center
# cannot be responsible for its use, misuse, or functionality.

# Version: 0.1
# Author: Wayne Clarke

################################################################# /COPYRIGHT ###
################################################################################

use strict;
use warnings;
use Getopt::Long;

main();

sub main
{
	my $tax_tbl;
	my $infile;
	my $surp = '';
	my $help = '';
	my $verb = '';

	GetOptions("taxonomy_table|tt=s" => \$tax_tbl,
				"input|i=s" => \$infile,
				"help|h" => \$help);
	
	if ($help) {
		usage();
	}

	# taxonomy table is required
	if (!defined $tax_tbl || !-f $tax_tbl) {
		print STDERR "Error with taxonomy table $tax_tbl!\n";
		usage();
	}

	# must specify an input file
	if (!defined $infile || !-f $infile) {
		print STDERR "Error with input file $infile!\n";
	}

	my %ranks = ("species" => 0, 
				 "genus" => 1,
				 "family" => 2,
				 "order" => 3,
				 "class" => 4,
				 "phylum" => 5,
				 "kingdom" => 6,
				 "root" => 7);

	my $taxonomy = parse_taxtbl($tax_tbl, \%ranks);

	# Print the header table of classification counts
	print "Rank\tCorrect\tIncorrect\tUnclassified\tSensitivity\tPrecision\tF-score\n";

	count_classifications($infile, \%ranks, $taxonomy);
}

sub usage
{
	print $0, "\n";
	print "\t" . "--taxonomy_table | -tt <taxonomy table>                          : required - table to get taxonomy paths from\n";
	print "\t" . "--input | -i                                                     : required - tab delimited classification file - format: <read> <classification taxid> <truth taxid>\n"; 
	print "\t" . "--help | -h                                                      : optional - shows usage\n";
	
	exit;
}

sub count_classifications
{
	my ($file, $ranks, $taxonomy) = @_;

	open(my $fh, "<", $file) or die "Can't open $file for reading:$!";

	my @classCounts; # 2d array with 8 rows per file (one per classification in %ranks) and 3 columns (Classified_Correctly, Classified_Incorrectly, Unclassified) 

	while (my $line = <$fh>) {
		chomp($line);

		my ($readID, $classificationTax, $truthTax)  = split(/\t/, $line);
	
		if (!defined $readID || !defined $classificationTax || !defined $truthTax) {
			die "Error in input file format line: $line\n";
		}

		# Unclassified at all ranks
		if ($classificationTax == 0) {
			# Increment unclassified count for all classification ranks
			for (my $i = 0; $i <= 7; $i++) {
				$classCounts[$i][2]++;
			}
		}
		# Classified correctly
		elsif ($truthTax eq $classificationTax) {
			my $rank = $taxonomy->{$truthTax}{'class_rank'};
			my $rank_num = $ranks->{$rank};
	
			# Increment appropriate count for each classification rank
			# For this case we are counting classfied correctly and unclassified @ rank level 
			for (my $i = 0; $i <= 7; $i++) {
				# Read is unclassified at this classification rank
				# i.e. $rank = genus and $i = species
				if ($i < $rank_num) {
					$classCounts[$i][2]++;
				}
				# Read is classified at this classification rank
				else {
					$classCounts[$i][0]++;
				}
			}
		}
		# Initial classification check not an exact match
		elsif ($truthTax ne $classificationTax) {
			my $truthPath = $taxonomy->{$truthTax}{'path'};
			my $classificationPath = $taxonomy->{$classificationTax}{'path'};
	
			# Read classification on the Classifier taxonomic path
			# Classifier classification more specific
			if ($classificationPath =~ /:$truthTax:/ || $classificationPath =~ /^$truthTax:/) {
				# Increment appropriate count for each classification rank
				# For this case all rank levels are correctly classified 
				for (my $i = 0; $i <= 7; $i++) {
					$classCounts[$i][0]++;
				}
	
			}
			# Classifier classification on the read taxonomic path
			elsif ($truthPath =~ /:$classificationTax:/ || $truthPath =~ /^$classificationTax:/) {
				my $cRank = $taxonomy->{$classificationTax}{'class_rank'};
				my $rank_num = $ranks->{$cRank};
	
				# Increment appropriate count for each classification rank
				# For this case we are counting classfied correctly and misclassified @ read rank level 
				for (my $i = 0; $i <= 7; $i++) {
					# Read is unclassified at this classification rank
					# i.e. $rank = genus and $i = species
					if ($i < $rank_num) {
						$classCounts[$i][2]++;
					}
					# Read is classified correctly at this classification rank
					else {
						$classCounts[$i][0]++;
					}
				}
	
			}
			# Search for a common part of the taxonomic path
			else {
				my @parts = split(/:/, $taxonomy->{$truthTax}{'path'});
				
				# Remove the read taxid from the path
				pop(@parts);
	
				# Check the path to the read's taxonomic parent(s) against the Classifier path 
				my $path_length = scalar(@parts);
				for(my $i = 0; $i < $path_length; $i++) {
					my $newPath = join(":", @parts);
					my $pid = pop(@parts);
	
					if ($classificationPath eq $newPath || $classificationPath =~ /^$newPath:/) {
						my $common_rank = $taxonomy->{$pid}{'class_rank'};
						my $rank_num = $ranks->{$common_rank};
	
						# Increment appropriate count for each classification rank
						# For this case we are counting classfied incorrectly and classified correctly @ common rank level 
						for (my $j = 0; $j <= 7; $j++) {
							# Read is misclassified at this classification rank
							# i.e. $rank = genus and $i = species
							if ($j < $rank_num) {
								$classCounts[$j][1]++;
							}
							# Read is classified correctly at this classification rank
							else {
								$classCounts[$j][0]++;
							}
						}
						last;
					}
				}
			}
		}
	}

	print_classification_summary(\@classCounts, $ranks);
}

sub print_classification_summary
{
	my ($classCounts, $ranks) = @_;

	foreach my $classRank (sort {$ranks->{$a} <=>$ranks->{$b}} keys %$ranks) {
		my $rankNum = $ranks->{$classRank};

		my $correct = 0;
		my $incorrect = 0;
		my $unclassified = 0;
		if (defined $classCounts->[$rankNum][0]) {
			$correct = $classCounts->[$rankNum][0];
		}
		if (defined $classCounts->[$rankNum][1]) {
			$incorrect = $classCounts->[$rankNum][1];
		}
		if (defined $classCounts->[$rankNum][2]) {
			$unclassified = $classCounts->[$rankNum][2];
		}

		my $sensitivity = $correct/($correct+$incorrect+$unclassified);
		my $precision = $correct/($correct+$incorrect);
		print join("\t", ($classRank, $correct, $incorrect, $unclassified, $sensitivity, $precision, 2*($precision*$sensitivity/($precision+$sensitivity)))), "\n"; 
	}
}

sub parse_taxtbl
{
	my ($file, $ranks) = @_;

	open(my $fh, "<", $file) or die "Can't open $file for reading:$!";

	my %taxonomy;
	my @norank;
	while (my $line = <$fh>) {
		chomp($line);

		my ($taxid, $rank, $j1, $path) = split(/\t/, $line);

		my $class_rank;
		if ($taxid == 1) {
			$class_rank = "root";
		}
		elsif (exists $ranks->{$rank}) {
			$class_rank = $rank;
		}
		elsif ($rank =~ /tribe/) {
			$class_rank = "family";
		}
		elsif ($rank eq "varietas" || $rank eq "forma") {
			$class_rank = "species";
		}
		elsif ($rank eq "superfamily") {
			$class_rank = "order";
		}
		elsif ($rank eq "superorder") {
			$class_rank = "class";
		}
		elsif ($rank eq "superclass") {
			$class_rank = "phylum";
		}
		elsif ($rank eq "superphylum") {
			$class_rank = "kingdom";
		}
		elsif ($rank eq "superkingdom") {
			$class_rank = "kingdom";
		}
		elsif ($rank =~ /(.*?) .*group$/) {
			$class_rank = "genus"
		}
		elsif ($rank =~ /^sub(.*)/ || $rank =~ /^infra(.*)/ || $rank =~ /^parv(.*)/ ) {
			$class_rank = $1;
		}
		elsif ($rank eq "no rank") {
			$class_rank = $rank;
			push(@norank, $taxid);
		}
		else {
			die "Could not determine classification rank for $rank\n";
		}

		$taxonomy{$taxid}{'class_rank'} = $class_rank;
		$taxonomy{$taxid}{'path'} = $path;
	}

	foreach my $tid (@norank) {
		my @parts = split(/:/, $taxonomy{$tid}{'path'});

		pop(@parts);

		foreach my $pid (reverse(@parts)) {
			if ($taxonomy{$pid}{'class_rank'} ne "no rank") {
				$taxonomy{$tid}{'class_rank'} = $taxonomy{$pid}{'class_rank'};
				last;
			}
		}
	}

	return(\%taxonomy);
}
