use lib "$ENV{FIREDIR}/SCRIPTS";

# perl read_crossvalidation_results.pl --train_summaryfile=human_test_1_FIRE/DNA/CV/human_test_1.CV0.tra_FIRE/DNA/human_test_1.CV0.tra.summary --test_summaryfile=human_test_1_FIRE/DNA/CV/human_test_1.CV0.tes_FIRE/DNA/human_test_1.CV0.tes.summary --train_matfile=human_test_1_FIRE/DNA/CV/human_test_1.CV0.tra_FIRE/DNA/human_test_1.CV0.tra.matrix --test_matfile=human_test_1_FIRE/DNA/CV/human_test_1.CV0.tes_FIRE/DNA/human_test_1.CV0.tes.matrix --zThreshold=0 --corrThreshold=-1 

# perl read_crossvalidation_results.pl --train_summaryfile=human_test_1_FIRE/DNA/CV/human_test_1.CV1.tra_FIRE/DNA/human_test_1.CV1.tra.summary --test_summaryfile=human_test_1_FIRE/DNA/CV/human_test_1.CV1.tes_FIRE/DNA/human_test_1.CV1.tes.summary --train_matfile=human_test_1_FIRE/DNA/CV/human_test_1.CV1.tra_FIRE/DNA/human_test_1.CV1.tra.matrix --test_matfile=human_test_1_FIRE/DNA/CV/human_test_1.CV1.tes_FIRE/DNA/human_test_1.CV1.tes.matrix --zThreshold=0 --corrThreshold=-1 

use Table;
use Sets;
use Getopt::Long;

use strict;

# Locations of the FIRE scripts and programs, both rooted at $FIREDIR.
my ($scriptdir, $progdir) = map { "$ENV{FIREDIR}/$_" } qw(SCRIPTS PROGRAMS);

# Command-line settings. Every one starts out undefined and is only given a
# value if the matching option is present on the command line.
my (
    $outfile,
    $train_summaryfile,
    $test_summaryfile,
    $train_matfile,
    $test_matfile,
    $zThreshold,
    $corrThreshold,
);

# Called with no arguments at all: show the usage line and stop.
unless (@ARGV) {
  die "Usage: perl read_crossvalidation_results.pl --train_summaryfile=FILE --test_summaryfile=FILE  --train_matfile=FILE --test_matfile=FILE --zThreshold=FLOAT --corrThreshold=FLOAT --outfile=FILE\n";
}

##	
##	Read training and testing summary files from crossvalidation
##	Keep only the motifs that are above a certain z-score threshold in test set
##	Keep only the ones that have a certain rank correlation between training and testing (show the same trend in over-under representations)
##	Print files for the next step
##	

# Parse the command line into the option variables declared above.
# GetOptions itself warns about options it does not recognise; that is left
# non-fatal here so that existing callers keep working unchanged.
GetOptions (
		'train_summaryfile=s'	=> \$train_summaryfile,
		'test_summaryfile=s'	=> \$test_summaryfile,
		'train_matfile=s'	=> \$train_matfile,
		'test_matfile=s'	=> \$test_matfile,
		'outfile=s'		=> \$outfile,
		'zThreshold=s'		=> \$zThreshold,
		'corrThreshold=s'	=> \$corrThreshold
		);

# An omitted threshold has always behaved as 0 in the numeric comparisons
# further down (undef numifies to 0); make that default explicit.
$zThreshold    = 0 unless defined $zThreshold;
$corrThreshold = 0 unless defined $corrThreshold;

# All four input files are loaded later on, so verify every one of them up
# front (previously only the two summary files were checked).  The summary
# files are checked first so the original messages keep their precedence.
# Diagnostics go to STDOUT with exit status 1, exactly as before.
foreach my $input (
		[ 'train_summaryfile',	$train_summaryfile ],
		[ 'test_summaryfile',	$test_summaryfile  ],
		[ 'train_matfile',	$train_matfile     ],
		[ 'test_matfile',	$test_matfile      ],
		)
	{
	my ($option, $path) = @$input;

	# A missing option used to surface as the confusing "<tab> does not exist."
	if (! defined $path)
		{
		print "In read_crossvalidation_results.pl:\t--$option was not specified.\n";
		exit 1;
		}

	if (! -e $path)
		{
		print "In read_crossvalidation_results.pl:\t$path does not exist.\n";
		exit 1;
		}
	}


# One Table object is reused for every file loaded by this script.
my $ta = Table->new;


#####################################################
###########  READ THE TRAINING FILES  ###############
#####################################################


#
#  Load the training-set matrix file.
#
$ta->loadFile($train_matfile);
my $a_train_ref_M = $ta->getArray();

# Column 0 supplies the motif identifiers; its first cell (presumably the
# header cell) is dropped.  This must happen before the first row is removed
# from the array below.
my @TRAIN_MOTIFS = @{ $ta->getColumn(0) };
shift @TRAIN_MOTIFS;

# Detach the first row (presumably the header) and drop its leading cell so
# that it lines up with the per-motif value rows.
my $a_train_ref_H = shift @{$a_train_ref_M};
shift @{$a_train_ref_H};

# Index every remaining row by its first cell.  The cell is removed from the
# row in place, so each stored array ref holds only the values.
my %TRAIN_MATRIX = map {
    my $motif = shift @{$_};
    ( $motif => $_ );
} @{$a_train_ref_M};

#
#  Load the training summary file: one row per motif, keyed by column 0.
#
$ta->loadFile($train_summaryfile);
my $a_train_ref_mo = $ta->getArray();

my (%TRAIN_STAT, %TRAIN_MOTIF_NUMBER, %train_motif_name);
my $train_cnt = 0;

for my $row (@{$a_train_ref_mo}) {
    my $motif = $row->[0];
    my $seed  = $row->[8];

    # Columns 1..11 of the row, stored under fixed field names.
    my %stats;
    @stats{qw(RNA COPIES MI RANK Z R S SEED DIST ORIE CONS)} = @{$row}[1 .. 11];

    $TRAIN_STAT{$motif}         = \%stats;
    $TRAIN_MOTIF_NUMBER{$motif} = $train_cnt++;    # 0-based position in the file

    # Display name: column 8, unless it is empty or '0', in which case the
    # motif identifier itself is used.
    $train_motif_name{$motif} = ($seed eq '' || $seed eq '0') ? $motif : $seed;
}


#####################################################
###########  READ THE TESTING FILES  ################
#####################################################

#
#  Load the testing-set matrix file.
#
$ta->loadFile($test_matfile);
my $a_test_ref_M = $ta->getArray();

# Column 0 supplies the motif identifiers; its first cell (presumably the
# header cell) is dropped.  This must happen before the first row is removed
# from the array below.
my @TEST_MOTIFS = @{ $ta->getColumn(0) };
shift @TEST_MOTIFS;

# Detach the first row (presumably the header) and drop its leading cell so
# that it lines up with the per-motif value rows.
my $a_test_ref_H = shift @{$a_test_ref_M};
shift @{$a_test_ref_H};

# Index every remaining row by its first cell.  The cell is removed from the
# row in place, so each stored array ref holds only the values.
my %TEST_MATRIX = map {
    my $motif = shift @{$_};
    ( $motif => $_ );
} @{$a_test_ref_M};



#
#  Load the testing summary file: one row per motif, keyed by column 0.
#
$ta->loadFile($test_summaryfile);
my $a_test_ref_mo = $ta->getArray();

my (%TEST_STAT, %TEST_MOTIF_NUMBER, %test_motif_name);
my $test_cnt = 0;

for my $row (@{$a_test_ref_mo}) {
    my $motif = $row->[0];
    my $seed  = $row->[8];

    # Columns 1..11 of the row, stored under fixed field names.
    my %stats;
    @stats{qw(RNA COPIES MI RANK Z R S SEED DIST ORIE CONS)} = @{$row}[1 .. 11];

    $TEST_STAT{$motif}         = \%stats;
    $TEST_MOTIF_NUMBER{$motif} = $test_cnt++;    # 0-based position in the file

    # Display name: column 8, unless it is empty or '0', in which case the
    # motif identifier itself is used.
    $test_motif_name{$motif} = ($seed eq '' || $seed eq '0') ? $motif : $seed;
}




#####################################################
################  FILTER MOTIFS  ####################
#####################################################

#
#  For every motif listed in the testing matrix:
#    - zero out matrix values that are "nan" or below the significance cut-off,
#    - correlate the training and testing value profiles,
#    - print one report line to STDOUT, ending in PASS! or NO-PASS,
#    - keep the motif when its test z-score exceeds --zThreshold AND the
#      correlation of the ranked profiles exceeds --corrThreshold.
#

my @PASS_MOTIFS = ();

# Cut-offs applied to the absolute matrix values (presumably signed -log10
# p-values -- TODO confirm).  Testing uses 0.05; training uses 0.1 divided by
# the number of test motifs, but never less strict than the testing cut-off.
# Both are loop-invariant, so they are computed once.  The division is only
# evaluated when there is at least one motif, which avoids a divide-by-zero
# that the per-iteration computation never had to face.
my $lptes = - Sets::log10(0.05);
my $lptra = $lptes;
if (@TEST_MOTIFS)
	{
	my $corrected = - Sets::log10(0.1/scalar(@TEST_MOTIFS));
	if ($corrected > $lptes) { $lptra = $corrected; }
	}

foreach my $mot (@TEST_MOTIFS) 
	{

	my $pv_tes = $TEST_MATRIX{ $mot };  	# get testing pvalues 
	my $pv_tra = $TRAIN_MATRIX{ $mot };  	# get training pvalues

	# A motif without a row in both matrices cannot be cross-validated.
	# Dereferencing the missing row would abort the whole run under
	# "strict refs", so report it and treat it as not passing instead.
	if (!defined($pv_tes) || !defined($pv_tra))
		{
		warn "In read_crossvalidation_results.pl:\tmotif $mot is missing from the training or testing matrix; skipped.\n";
		print "$mot\t(missing matrix row)\tNO-PASS\n";
		next;
		}

	# Work on copies so the loaded matrices are left untouched.
	my @pvalues_tes = @$pv_tes;
	my @pvalues_tra = @$pv_tra;

	# Zero out non-numeric ("nan", any letter case) and non-significant entries.
	# The loop variable aliases the array element, so assignment edits the copy.
	foreach my $v (@pvalues_tes)
		{
		if (($v =~ m/nan/i) || (abs($v) < $lptes)) { $v = 0; }
		}
	foreach my $v (@pvalues_tra)
		{
		if (($v =~ m/nan/i) || (abs($v) < $lptra)) { $v = 0; }
		}

	my $test_zscore = $TEST_STAT{$mot}->{Z};
	my $train_zscore = $TRAIN_STAT{$mot}->{Z};

	# Correlation of the ranked profiles (reported as "spear-corr") and of the
	# raw thresholded profiles (reported as "pear-corr"; informational only).
	my $rank_tes = Sets::ranks(\@pvalues_tes);
	my $rank_tra = Sets::ranks(\@pvalues_tra);
	my $rank_corr = Sets::pearson($rank_tra, $rank_tes);
	my $pear_corr = Sets::pearson(\@pvalues_tes, \@pvalues_tra);

	print "$mot\t(train-zScore) $train_zscore\t(test-zScore) $test_zscore\t(spear-corr) $rank_corr\t(pear-corr) $pear_corr"; #print "\n";
#	print "$mot"; foreach my $c (@pvalues_tes) { print "\t$c"; } 
#	print "$mot"; foreach my $c (@$rank_tes)   { print "\t$c"; } print "\t\t\t$test_zscore\t$rank_corr\n";
#	print "$mot"; foreach my $c (@$rank_tra)   { print "\t$c"; } print "\n";
#	print "$mot"; foreach my $c (@pvalues_tra) { print "\t$c"; } print "\n\n";

	if (($test_zscore>$zThreshold) && ($rank_corr>$corrThreshold))
		{
		push @PASS_MOTIFS, $mot;
		print "\tPASS!\n";
		}
	else
		{
		print "\tNO-PASS\n";
		}
	}

#
#  Write the motifs that passed the filter to --outfile, one per line.
#  When no motif passed, no output file is created at all.
#
if (scalar(@PASS_MOTIFS) > 0)
	{
	# Without --outfile there is nowhere to write; fail with a clear message
	# instead of a cryptic open() failure on an empty file name.
	defined($outfile)
		or die "In read_crossvalidation_results.pl:\t--outfile was not specified.\n";

	# Three-argument open with a lexical handle: the file name can no longer
	# be interpreted as an open mode, and the OS error is reported.
	open(my $out_fh, '>', $outfile)
		or die "Can't open $outfile: $!";

	foreach my $mot (@PASS_MOTIFS) 
		{
		print {$out_fh} "$mot\n";
		}

	# Buffered write errors (e.g. a full disk) only surface at close time.
	close($out_fh)
		or die "Can't close $outfile: $!";
	}







