/**
*	SIGRS - Identifying genomic regions of contrasting composition using a partial sum process
*	Copyright (C) 2008 Pontus Larsson
*	 
*	This file is part of SIGRS.
*	  
*	SIGRS is free software: you can redistribute it and/or modify
*	it under the terms of the GNU General Public License as published by
*	the Free Software Foundation, either version 3 of the License, or
*	(at your option) any later version.
*
*	SIGRS is distributed in the hope that it will be useful,
*	but WITHOUT ANY WARRANTY; without even the implied warranty of
*	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*	GNU General Public License for more details.
*
*	You should have received a copy of the GNU General Public License
*	along with SIGRS. If not, see <http://www.gnu.org/licenses/>.
*/ 

package SIGRS;

      
/**
*	SIGRS is a collection of routines used in searching for regions of contrasting composition (CCRs) in sequence files using a partial sum process.
*	Significance of segments is evaluated using Karlin-Altschul statistics and specifically an extension by Karlin-Dembo allowing
*	for nucleotides to have a Markov-dependence (see e.g. <a href="http://www.pnas.org/cgi/reprint/90/12/5873" target="_blank">Karlin &amp; Altschul (1993)</a> and
*	<a href="http://www.jstor.org/view/00018678/ap050087/05a00070/0" target="_blank">Karlin &amp; Dembo (1992)</a>).
*	<P>
*	The routines are provided as is and no guarantee regarding stability etc. is given so use at your own risk!
*	<P>
*	<P>
*	See publication <b>Larsson, P., Hinas, A., Ardell, D.H., Kirsebom, L.A., Virtanen, A. and S&ouml;derbom, F.</b> <i>De novo search for non-coding RNA genes in the AT-rich genome of Dictyostelium discoideum: performance of
*	Markov-dependent genome feature scoring</i>
*	<P>
*	Questions and comments can be directed to <a href="mailto:Pontus.Larsson@icm.uu.se">Pontus.Larsson@icm.uu.se</a>
*	@author Pontus Larsson
*
*/

public class KAStatistics {

	/** Newton-Raphson stops when two successive lambda estimates differ by at most this much. */
	private static final double LAMBDA_TOLERANCE = 1e-8;
	/** Maximum number of Newton-Raphson iterations performed by {@link #lambda(double[][], double[][])}. */
	private static final int LAMBDA_MAX_ITERATIONS = 10000;
	/** Preferred starting point of the Newton-Raphson iteration. */
	private static final double LAMBDA_INITIAL_GUESS = 0.5;
	/** Maximum number of times the starting point is doubled while searching for a usable start. */
	private static final int LAMBDA_MAX_START_DOUBLINGS = 60;
	/** The sigma series is truncated once a term is no larger than this. */
	private static final double SIGMA_LIMIT = 1e-4;

	/**
	*	Determines the bit score cutoff at significance level alpha.
	*	According to [1] in <a href="http://www.pnas.org/cgi/reprint/90/12/5873" target="_blank">Karlin &amp; Altschul (1993)</a>
	*	solved for x at Prob(S'&gt;=x) = alpha
	*	@param alpha The significance level
	*	@param N The search space size
	*	@param K The parameter K
	*	@param L The parameter lambda
	*	@return The bit score cutoff at the desired significance level
	*/
	public static double cutoff(double alpha, int N, double K, double L) {
		return cutoff(alpha,(double) N,K,L);
	}

	/**
	*	Same as {@link #cutoff(double, int, double, double)} but accepts search space
	*	sizes that do not fit in an <code>int</code>.
	*	No range check is made on alpha; values outside (0,1) give infinite or NaN results.
	*	@param alpha The significance level
	*	@param N The search space size
	*	@param K The parameter K
	*	@param L The parameter lambda
	*	@return The bit score cutoff at the desired significance level
	*/
	public static double cutoff(double alpha, double N, double K, double L) {
		// Prob(S'>=x) = 1-exp(-exp(-x)) = alpha  =>  x = -ln(-ln(1-alpha)).
		// log1p keeps ln(1-alpha) accurate when alpha is tiny; 1.-alpha would round to 1.
		double x0 = -Math.log(-Math.log1p(-alpha));
		return denormalizeScore(x0,N,K,L);
	}

	/**
	*	Gets the bit score from a normalized score according to Karlin &amp; Altschul (1993),
	*	i.e. inverts S' = lambda*S - ln(K*N).
	*/
	private static double denormalizeScore(double score, double N, double K, double L) {
		return ((score+Math.log(K*N))/L);
	}

	/**
	*	Calculates the entropy of a scoring matrix, lambda times the sum over all
	*	scores of score*probability*exp(lambda*score).
	*	@param s The input score matrix
	*	@param p The probabilities associated with the score matrix
	*	@param L The estimated lambda parameter for s and p
	*	@return The calculated entropy
	*/
	public static double entropy(double[][] s, double[][] p, double L) {
		double[][] r = reshape(s,p);
		double H = 0;
		for (int i=0; i<r.length; i++) {
			double score = r[i][0];
			double probability = r[i][1];
			H += L*score*probability*Math.exp(L*score);
		}
		return H;
	}

	/**
	*	Estimates the parameter K for the independent nucleotides case
	*	For details, see page 8 in
	*	<a href="ftp://ftp.ncbi.nlm.nih.gov/blast/documents/developer/scoring.pdf" target="_blank">BLAST scoring parameters</a>
	*	@param s Score matrix
	*	@param p Probability matrix associated with scores
	*	@param L The estimated lambda
	*	@param H The calculated entropy of the score matrix
	*	@return An estimated value for the parameter K
	*	@throws IllegalArgumentException if no score has a non-zero probability
	*/
	public static double estimateK(double[][] s, double[][] p, double L, double H) {
		// One row per score between the lowest and highest score, with its total probability
		double[][] r = reshape(s,p);

		// delta is the greatest common divisor of the scores.
		double delta = gcd(r);

		double lowProb = r[0][1];
		double highProb = r[r.length-1][1];
		boolean lowIsMinusDelta = r[0][0] == (-1.*delta) && lowProb > 0;
		boolean highIsDelta = r[r.length-1][0] == delta && highProb > 0;

		// Special case 1: u=delta AND l=-1*delta (with non-zero probability)
		if (lowIsMinusDelta && highIsDelta)
			return ((highProb - lowProb)*(highProb - lowProb))/lowProb;

		// Special case 2: u=delta AND l!=-1*delta (with non-zero probability)
		if (highIsDelta)
			return (H/(delta*L))*(1-Math.exp(-1.*delta*L));

		// Special case 3: u!=delta AND l=-1*delta (with non-zero probability)
		if (lowIsMinusDelta) {
			// Expected score
			double sum = 0;
			for (int i=0; i<r.length; i++)
				sum += r[i][0]*r[i][1];
			return ((L*(1-Math.exp(-1.*delta*L)))/(delta*H))*(sum*sum);
		}

		// No more special cases: the sigma function approximates an infinite sum
		double sigmaSum = sigma(s,p,L);
		return (delta*L*Math.exp(-2*sigmaSum))/(H*(1-Math.exp(-1.0*L*delta)));
	}

	/**
	*	Finds the greatest common divisor of the scores in r. The first score with
	*	a non-zero probability seeds the result; after that only scores with a
	*	positive probability are folded in.
	*	@param r Matrix as returned by {@link #reshape(double[][], double[][])}
	*	@return The greatest common divisor of the observable scores
	*	@throws IllegalArgumentException if every probability in r is zero
	*/
	private static double gcd(double[][] r) {
		int i = 0;
		while (i < r.length && r[i][1] == 0)
			i++;
		if (i == r.length)
			throw new IllegalArgumentException("Cannot determine the greatest common divisor: no score has a non-zero probability");
		double divisor = r[i][0];
		for (i++; i<r.length; i++) {
			if (r[i][1] > 0)
				divisor = gcd(divisor,r[i][0]);
		}
		return divisor;
	}

	/**
	*	Finds the greatest common divisor of two numbers using the Euclidean algorithm.
	*	Signs are ignored. If one argument is zero the absolute value of the other is returned.
	*	The remainder is computed in floating point, so the result is only exact
	*	for values whose remainders are exactly representable (e.g. integer-valued doubles).
	*	@param a The first number
	*	@param b The second number
	*	@return The greatest common divisor of a and b, or NaN if a non-zero pair contains a NaN or infinite value
	*/
	public static double gcd(double a, double b) {
		a = Math.abs(a);
		b = Math.abs(b);
		if (a == 0)
			return b;
		if (b == 0)
			return a;
		// A NaN or infinite operand makes a%b NaN, which never equals 0: the loop below would not end
		if (Double.isNaN(a) || Double.isInfinite(a) || Double.isNaN(b) || Double.isInfinite(b))
			return Double.NaN;
		double rem;
		while ((rem = a%b) != 0) {
			a = b;
			b = rem;
		}
		return b;
	}

	/**
	*	Used in estimation of K. Approximates the infinite sum sigma. Stops
	*	when terms are smaller than limit. The stability of this iteration
	*	is NOT guaranteed
	*/
	private static double sigma(double[][] s, double[][] prob, double lambda) {
		// reshape(..) returns a matrix where each score and the total probability
		// of observing that score is stored
		double[][] r = reshape(s,prob);

		int minStep = (int) r[0][0];
		int maxStep = (int) r[r.length-1][0];
		// Lowest and highest sum reachable after the current number of steps
		int lower = minStep;
		int upper = maxStep;

		// Distribution of the sum after one step
		double[] p = Methods.getColumn(1,r);

		// First term of the series (divided by 1)
		double sum = truncatedSum(p,lower,upper,lambda);
		double diff = 2*SIGMA_LIMIT;
		int iteration = 1;
		while (Math.abs(diff) > SIGMA_LIMIT) {
			double oldSum = sum;
			double[] oldP = p;
			lower += minStep;
			upper += maxStep;
			// Convolve with the single-step distribution to get the distribution after one more step
			p = new double[upper-lower+1];
			for (int k=0; k<r.length; k++)
				for (int j=0; j<oldP.length; j++)
					p[j+k] += oldP[j]*r[k][1];
			iteration++;
			// The term for this step count is weighted by 1/iteration
			sum = truncatedSum(p,lower,upper,lambda)/iteration + oldSum;
			diff = sum-oldSum;
		}
		return sum;
	}

	/**
	*	Sums p over the values lower..upper, where p[i-lower] belongs to value i.
	*	Entries for negative values are weighted by exp(i*lambda), the rest are added as is.
	*/
	private static double truncatedSum(double[] p, int lower, int upper, double lambda) {
		double sum = 0;
		for (int i=lower; i<=upper; i++) {
			if (i < 0)
				sum += p[i-lower]*Math.exp(i*lambda);
			else
				sum += p[i-lower];
		}
		return sum;
	}


	/**
	*	Calculates the expect value of a score according to <a href="http://www.pnas.org/cgi/reprint/90/12/5873" target="_blank">Karlin &amp; Altschul (1993) p.5875</a>
	*	[y=K*N*exp(-lambda*x)]
	*	@param score The bit score to calculate expect value for
	*	@param N The search space size
	*	@param K The parameter K
	*	@param L The parameter lambda
	*	@return The calculated expect value for the input score
	*/
	public static double expect(double score, double N, double K, double L) {
		return (K*N*Math.exp(-1.*score*L));
	}

	/**
	*	Estimates lambda, the solution of sum(p*exp(lambda*s)) = 1, by an iterative
	*	Newton-Raphson until two successive estimates differ by at most 1e-8.
	*	<P>
	*	The equation always has the trivial solution lambda = 0 when the probabilities sum to one.
	*	To avoid converging to it, the iteration is started from a point where the
	*	function is already above 1 or already increasing; 0.5 is used when it qualifies.
	*	If no such point is found the iteration starts from 0.5 regardless.
	*	@param s Score matrix
	*	@param p Probability matrix associated with scores
	*	@return An estimated value of lambda for the score matrix
	*	@throws IllegalStateException if the iteration has not converged within 10000 iterations
	*	or the estimate became NaN
	*/
	public static double lambda(double[][] s, double[][] p) {
		double L = newtonStart(s,p);
		int n = 0;
		double diff = 1;
		while (diff > LAMBDA_TOLERANCE && n < LAMBDA_MAX_ITERATIONS) {
			double lambdaOld = L;
			L = lambdaOld - (f(p,s,lambdaOld)-1.0)/df(p,s,lambdaOld);
			n++;
			diff = Math.abs(L-lambdaOld);
		}
		// Written as a negation so that a NaN delta is reported too
		if (!(diff <= LAMBDA_TOLERANCE)) {
			throw new IllegalStateException("Estimation of lambda did not converge after "+n
				+" iterations (current lambda = "+L+", delta = "+diff+")");
		}
		return L;
	}

	/**
	*	Picks the starting point for the Newton-Raphson iteration in lambda().
	*	Starting where the function is below 1 and not increasing sends the
	*	iteration to the trivial root at 0, so such points are skipped by doubling.
	*	Falls back to the initial guess if the search is unsuccessful.
	*/
	private static double newtonStart(double[][] s, double[][] p) {
		double start = LAMBDA_INITIAL_GUESS;
		for (int k=0; k<LAMBDA_MAX_START_DOUBLINGS; k++) {
			if (f(p,s,start) >= 1.0 || df(p,s,start) > 0.0)
				return start;
			start *= 2.0;
		}
		return LAMBDA_INITIAL_GUESS;
	}

	/**
	*	Used by lambda() function for Newton-Raphson: the sum of p*exp(lambda*s) over all matrix entries
	*/
	private static double f(double[][] p, double[][] s, double lambda) {
		double f = 0;
		for (int i=0; i<p.length; i++)
			for (int j=0; j<p[i].length; j++)
				f += p[i][j]*Math.exp(lambda*s[i][j]);
		return f;
	}

	/**
	*	Used by lambda() function for Newton-Raphson: the derivative of f with respect to lambda
	*/
	private static double df(double[][] p, double[][] s, double lambda) {
		double f = 0;
		for (int i=0; i<p.length; i++)
			for (int j=0; j<p[i].length; j++)
				f += p[i][j]*s[i][j]*Math.exp(lambda*s[i][j]);
		return f;
	}

	/**
	*	Returns a matrix spanning the scores in s where the first column
	*	of each row is a score and the second column is the total
	*	probability of observing that score. There is one row for every
	*	integer from the lowest to the highest score (both truncated to int);
	*	scores that do not equal one of these integers are ignored.
	*	@param s Score matrix
	*	@param p Probability matrix associated with scores
	*	@return The scores and their total probabilities, ordered by increasing score
	*/
	public static double[][] reshape(double[][] s, double[][] p) {
		int min = (int) Methods.min(s);
		int max = (int) Methods.max(s);
		int len = max-min+1;
		double[][] reshaped = new double[len][2];
		for (int i=0; i<len; i++)
			reshaped[i][0] = 1.*(min+i);
		// Single pass over the matrix: each entry is added to the row holding exactly its score
		for (int j=0; j<s.length; j++) {
			for (int k=0; k<s[j].length; k++) {
				double score = s[j][k];
				int row = (int) (score-min);
				if (row >= 0 && row < len && reshaped[row][0] == score)
					reshaped[row][1] += p[j][k];
			}
		}
		return reshaped;
	}
}
