#!/usr/bin/env perl


# Author : NTM
# Created : 20/01/05
#
#
# Copyright (C) Nicolas Thierry-Mieg, 2006.
#
#
# This file is part of InterPool, written by 
# Nicolas Thierry-Mieg (CNRS, France) Nicolas.Thierry-Mieg@imag.fr
#
# InterPool is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# InterPool is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with InterPool; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA


# Package holds subs to analyze files generated by the C
# program "ipoolSimulations":
#  analyzeCsim analyzes a single file (outputs summary to STDOUT);
#  analyzeCsimDir analyzes all files in current dir (idem, output to STDOUT).
# The subs rely on a custom sort order for file names, implemented by the sub NICSORT.
# NOTE 15/09/04: we now have a results file for each (q,k) value,
# so I'm removing the code that took care of dealing with files that
# contained lots of (q,k) values.


use warnings ;
use strict ;


sub analyzeCsim
{
# Input: $file, the name of one results file written by ipoolSimulations.
# Prints to STDOUT a summary of all the simms in this file: the per-simm
# averages of correctly found pos/neg vars, wrongly tagged vars, ambiguous
# vars, number of solutions and distance, followed by the number of pools,
# the gain (n / number of pools), and the standard deviations of the number
# of ambiguous vars and of mis-taggings.
#
# The file is read twice: the first pass checks the syntax of every line
# and computes the averages, the second pass uses these averages to compute
# the standard deviations.
# The summary is only printed when the ___DONE___ marker is reached; the
# accumulators are not reset after it, so a single marker per file is assumed.
#
# Dies if: the file cannot be opened, a line cannot be parsed, old and new
# style count headers are mixed, the design or "performing N simulations"
# header is missing, the declared number of simms differs from the number
# found, or there is nothing to average.
# If the file holds no ___DONE___ marker (eg truncated or still being
# written), a warning goes to STDERR and nothing is printed to STDOUT.

    (@_ != 1) && die "analyzeCsim needs one argument: a file name\n" ;
    my ($file) = (@_) ;

    my $qopt ; # optimal q value
    my ($n, $q, $k) ; # STD system used

    # total number of positive vars, of false pos/neg pools, and of simms:
    # all of these are declared in the file headers
    my ($totalpos, $totalfp, $totalfn, $declaredTotalSims) ;

    my $totalsim = 0 ; # number of simms actually found: used for double-checking

    # number of solutions found, and distance from observed sig
    my ($solfound, $dist) = (0,0) ;

    my ($foundpos, $foundneg, $wrongaspos, $wrongasneg, $ambi) = (0,0,0,0,0) ;

    my $sawDone = 0 ; # set when the ___DONE___ marker is read

    # three-argument open with a lexical handle: the file name is taken
    # literally (no whitespace stripping, no mode characters interpreted),
    # and a lexical line variable is used so the caller's $_ is left alone
    open(my $in, '<', $file) || die "cannot open file $file in analyzeCsim: $!\n" ;
    while (my $line = <$in>)
    {
        chomp($line) ;

        ####################################################
        # parse header
        ####################################################

        if ($line =~ /^optimal q is (\d+)$/)
        {
            $qopt = $1 ;
        }
        elsif ($line =~ /^using design file=.*\/STD\.n(\d+)\.q(\d+)\.k(\d+)$/)
        {
            ($n, $q, $k) = ($1, $2, $3) ;
        }
        # for partial design files
        elsif ($line =~ /^using design file=STD\.n(\d+)\.q(\d+)\.k(\d+)\.from/)
        {
            ($n, $q, $k) = ($1, $2, $3) ;
        }
        # for older results: no faints and weaks, just pos and neg
        elsif ($line =~ /^nb of posVars=(\d+), nb of falsePos=(\d+), falseNeg=(\d+)$/)
        {
            # old and new formats are exclusive
            (defined $totalfp) && die "in analyzeCsim, analyzing old style $file: totalfp defined\n" ;
            (defined $totalfn) && die "in analyzeCsim, analyzing old style $file: totalfn defined\n" ;

            ($totalpos, $totalfp, $totalfn) = ($1, $2, $3) ;
        }
        # new results files, with faints and weaks (13/06/06)
        elsif ($line =~ /^nb of posVars=(\d+), nb of falseStrong=(\d+), falseWeak=(\d+), nb of falseFaint=(\d+), falseNone=(\d+)$/)
        {
            # old and new formats are exclusive
            (defined $totalfp) && die "in analyzeCsim, analyzing new style $file: totalfp defined\n" ;

            # strong and weak false observations are counted as false pos's,
            # faint and none as false negs
            $totalpos = $1 ;
            $totalfp = $2+$3 ;
            $totalfn = $4+$5 ;
        }
        elsif ($line =~ /^performing (\d+) simulations$/)
        {
            $declaredTotalSims = $1 ;
        }

        ####################################################
        # parse simm results
        ####################################################

        elsif ($line =~ /^(\d+) solutions found, at distance (\d+)$/)
        {
            # this line is used to count the simms
            $solfound += $1 ;
            $dist += $2 ;
            $totalsim++ ;
        }
        elsif ($line =~ /^correctly identified (\d+) positive vars$/)
        {
            $foundpos += $1 ;
        }
        elsif ($line =~ /^wrongly tagged as negatives (\d+) positive vars$/)
        {
            $wrongasneg += $1 ;
        }
        elsif ($line =~ /^correctly identified (\d+) negative vars$/)
        {
            $foundneg += $1 ;
        }
        elsif ($line =~ /^wrongly tagged as positives (\d+) negative vars$/)
        {
            $wrongaspos += $1 ;
        }
        elsif ($line =~ /^(\d+) vars are ambiguous$/)
        {
            $ambi += $1 ;
        }

        ####################################################
        # end of file: produce beginning of summary
        ####################################################

        elsif ($line =~ /^___DONE___$/)
        {
            # a batch of simms has been read, print beginning of summary and
            # save averages (for standard deviations in the second pass)
            $sawDone = 1 ;

            # make sure the headers needed below were present, so that we
            # fail with a clear message rather than with uninitialized-value
            # warnings followed by a division by zero
            (defined $declaredTotalSims) ||
                die "in analyzeCsim, $file: no \"performing N simulations\" header found\n" ;
            if ($declaredTotalSims != $totalsim)
            {
                die "total simms calculated ($totalsim) different from number declared in header!\n" ;
            }
            ($totalsim > 0) ||
                die "in analyzeCsim, $file: no simulation results found, cannot compute averages\n" ;
            (defined $n) ||
                die "in analyzeCsim, $file: no \"using design file\" header found\n" ;
            my $nbpools = $q * $k ;
            ($nbpools > 0) ||
                die "in analyzeCsim, $file: q=$q and k=$k give zero pools, cannot compute gain\n" ;

            # find the average values
            foreach my $sumRef (\$solfound, \$dist, \$foundpos, \$foundneg,
                                \$wrongaspos, \$wrongasneg, \$ambi)
            {
                $$sumRef /= $totalsim ;
            }

            # now prepare output summary for this batch
            print "for the setting n=$n, q=$q, k=$k :\n" ;
            print "using $totalpos pos vars, $totalfp false pos's and $totalfn false negs (optimal q: $qopt)\n" ;
            print "recovered $foundpos pos's and $foundneg negs, unresolved $ambi\n" ;
            if ($wrongaspos || $wrongasneg)
            {
                print "ERRORS: wrongly identified $wrongaspos as pos and $wrongasneg as neg\n" ;
            }
            print "found $solfound solutions on average, at distance $dist\n" ;

            print "number of pools: $nbpools\n" ;
            my $gain = $n / $nbpools ;
            printf ("Gain is : %.1f\n", $gain) ;
        }

        # lines to discard: header lines carrying nothing we use, empty lines,
        # and lines that may be present if doSimulation was compiled with
        # some DEBUG on
        elsif (($line =~ /^using costs: NEG==\d+, FAINT==\d+, WEAK==\d+, POS==\d+$/)
               || ($line =~ /^using as random generator:/)
               || ($line =~ /^starting sim number /)
               || ($line =~ /^number of positive vars: /)
               || ($line =~ /^false positive: pool \d+ from layer /)
               || ($line =~ /^false negative: pool \d+ from layer /)
               || ($line =~ /^variable .*, but tagged \d+ in mergedDeducedVV$/)
               || ($line =~ /^$/))
        {
            # nothing to record
        }

        else
        {
            die "could not parse line:\n$line\n" ;
        }
    }
    close($in) ;

    if (! $sawDone)
    {
        # without the end marker no summary was printed above and none would
        # be printed below: say so on STDERR instead of skipping silently
        warn "in analyzeCsim, no ___DONE___ marker found in $file: no summary produced\n" ;
        return ;
    }


    ############################################################
    # now rescan the whole file to calculate standard deviations
    ############################################################

    # standard deviation is defined as: sqrt{(1/$totalsim)*(SUM(i=1..$totalsim)[(Xi-Xmoy)^2])}

    my ($ambidev, $wrongasposdev, $wrongasnegdev) = (0,0,0) ;
    # when there were no errors, the wrongasX line is not output; so we have to count them
    my ($wrongasposNB, $wrongasnegNB) = (0,0) ;

    open($in, '<', $file) || die "cannot re-open file $file in analyzeCsim: $!\n" ;
    while (my $line = <$in>)
    {
        chomp($line) ;

        if ($line =~ /^wrongly tagged as negatives (\d+) positive vars$/)
        {
            $wrongasnegNB++ ;
            $wrongasnegdev += ($1 - $wrongasneg) * ($1 - $wrongasneg) ;
        }
        elsif ($line =~ /^wrongly tagged as positives (\d+) negative vars$/)
        {
            $wrongasposNB++ ;
            $wrongasposdev += ($1 - $wrongaspos) * ($1 - $wrongaspos) ;
        }
        elsif ($line =~ /^(\d+) vars are ambiguous$/)
        {
            $ambidev += ($1 - $ambi) * ($1 - $ambi) ;
        }
        elsif ($line =~ /^___DONE___$/)
        {
            # done reading a batch of simms, calculate and output deviations
            $ambidev = sqrt($ambidev / $totalsim) ;

            # for wrongas*dev: we must add the "0 error" simms that didn't get
            # a printout; each contributes (0 - average)^2
            $wrongasposdev += ($totalsim - $wrongasposNB) * ($wrongaspos * $wrongaspos) ;
            $wrongasposdev = sqrt($wrongasposdev / $totalsim) ;

            $wrongasnegdev += ($totalsim - $wrongasnegNB) * ($wrongasneg * $wrongasneg) ;
            $wrongasnegdev = sqrt($wrongasnegdev / $totalsim) ;

            print "standard dev on ambis: $ambidev\n" ;
            print "standard dev on wrongAsPos: $wrongasposdev\n" ;
            print "standard dev on wrongAsNeg: $wrongasnegdev\n" ;
            print "\n" ;
        }
        # any other line: syntax has been checked in the first pass, just skip
    }
    close($in) ;
}




sub analyzeCsimDir
{
# Runs analyzeCsim on every file of the current directory whose name
# starts with "STD.n" followed by a digit, visiting the files in the
# order defined by NICSORT. All summaries go to STDOUT.

    opendir(my $cwdHandle, ".") || die "cannot open directory ./\n" ;
    my @candidates = grep { /^STD\.n\d/ } readdir($cwdHandle) ;
    closedir($cwdHandle) ;

    # NICSORT is used as the comparison routine (it dies on any selected
    # name that it cannot parse)
    my @ordered = sort NICSORT @candidates ;

    foreach my $simFile (@ordered)
    {
        analyzeCsim($simFile) ;
    }
}


sub NICSORT
{
    # Comparison routine for sort(): orders simulation result files by
    # the numbers embedded in their names. It compares the sort globals
    # $a and $b and returns -1, 0 or 1.
    # Files are currently named:
    #   STD.n10000.q17.k12.pos10.fp2.fn0.nsim10000.seed\d+
    # 06/10/23: design files made from regrouping partial STD designs add
    # some stuff after k\d+\. and before pos, hence the .* in the pattern.
    # Dies if either name does not follow this scheme.
    my $nameFormat = qr/^STD\.n(\d+)\.q(\d+)\.k(\d+)\..*pos(\d+)\.fp(\d+)\.fn(\d+)\.nsim\d+\.seed\d+$/ ;

    # in list context a successful match returns the six captured numbers,
    # in the order (n, q, k, pos, fp, fn); a failed match returns ()
    my @fieldsA = ($a =~ $nameFormat) ;
    @fieldsA || die "in NICSORT, cannot parse file name a: $a\n" ;
    my @fieldsB = ($b =~ $nameFormat) ;
    @fieldsB || die "in NICSORT, cannot parse file name b: $b\n" ;

    # keys by decreasing priority: n, pos, q, k, fp, fn
    # (given as indexes into the captured fields)
    foreach my $idx (0, 3, 1, 2, 4, 5)
    {
        my $order = ($fieldsA[$idx] <=> $fieldsB[$idx]) ;
        return $order if ($order != 0) ;
    }

    # the files only differ by nsim or seed, order is indifferent
    return 0 ;
}


# MAIN: choose what to analyze from the number of command-line arguments
my $argCount = scalar(@ARGV) ;

if ($argCount > 1)
{
    die "wrong number of args.\nUse no args to analyze all files in current dir, or a single filename to analyze it\n" ;
}

if ($argCount == 1)
{
    # a single arg: it should be a single file to analyze
    analyzeCsim($ARGV[0]) ;
}
else
{
    # no arguments: deal with all files in current dir
    analyzeCsimDir() ;
}
