/**
*	SIGRS - Identifying genomic regions of contrasting composition using a partial sum process
*	Copyright (C) 2008 Pontus Larsson
*	 
*	This file is part of SIGRS.
*	  
*	SIGRS is free software: you can redistribute it and/or modify
*	it under the terms of the GNU General Public License as published by
*	the Free Software Foundation, either version 3 of the License, or
*	(at your option) any later version.
*
*	SIGRS is distributed in the hope that it will be useful,
*	but WITHOUT ANY WARRANTY; without even the implied warranty of
*	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*	GNU General Public License for more details.
*
*	You should have received a copy of the GNU General Public License
*	along with SIGRS. If not, see <http://www.gnu.org/licenses/>.
*/   

package SIGRS;

import java.util.Vector;

import Jama.EigenvalueDecomposition;
import Jama.Matrix;
    
/**
*	SIGRS is a collection of routines used in searching for regions of contrasting composition (CCRs) in sequence files using a partial sum process.
*	Significance of segments is evaluated using Karlin-Altschul statistics and specifically an extension by Karlin-Dembo allowing
*	for nucleotides to have a Markov-dependence (see e.g. <a href="http://www.pnas.org/cgi/reprint/90/12/5873" target="_blank">Karlin &amp; Altschul (1993)</a> and
*	<a href="http://www.jstor.org/view/00018678/ap050087/05a00070/0" target="_blank">Karlin &amp; Dembo (1992)</a>).
*	<P>
*	The routines are provided as is and no guarantee regarding stability etc. is given so use at your own risk!
*	<P>
*	<P>
*	See publication <b>Larsson, P., Hinas, A., Ardell, D.H., Kirsebom, L.A., Virtanen, A. and S&ouml;derbom, F.</b> <i>De novo search for non-coding RNA genes in the AT-rich genome of Dictyostelium discoideum: performance of
*	Markov-dependent genome feature scoring</i>
*	<P>
*	Questions and comments can be directed to <a href="mailto:Pontus.Larsson@icm.uu.se">Pontus.Larsson@icm.uu.se</a>
*	@author Pontus Larsson
*
*/

public class KDStatistics {

	/**
	*	Computes the expected score of a single step of the chain.
	*	Every transition (i,j) contributes its score weighted by the stationary
	*	probability of state i and the transition probability from i to j.
	*	@param s The score matrix
	*	@param p The associated probabilities
	*	@return The expected score
	*/
	public static double expect(double[][] s, double[][] p)	{
		final double[] stationary = stationaryDistribution(p);
		double expected = 0;
		for (int from = 0; from < s.length; from++) {
			final double[] scoreRow = s[from];
			for (int to = 0; to < scoreRow.length; to++) {
				expected += stationary[from]*scoreRow[to]*p[from][to];
			}
		}
		return expected;
	}

	/**
	*	Estimates the parameter K for a score matrix assuming Markov-dependent letters.
	*	Follows the steps on pages 137-139 in
	*	<a href="http://www.jstor.org/view/00018678/ap050087/05a00070/0" target="_blank">
	*	Karlin &amp; Dembo (1992)</a>. Not much care is taken to make sure iterations
	*	converge and that exceptions don't occur.
	*	<P>
	*	The minimum and maximum scores are truncated to integers to build the range of
	*	possible steps. A caution is printed to standard output when the first row of the
	*	reshaped score table holds -1 and/or the last row holds 1 with non-zero probability.
	*	@param theta The estimated theta for the score matrix
	*	@param u The right frequency eigenvector of PHI(theta)
	*	(can be obtained from the theta estimation)
	*	@param s The score matrix
	*	@param p The associated probabilities
	*	@return The estimate for the parameter K
	*/
	public static double K(double theta, double[] u, double[][] s, double[][] p)
	{
		// Check for special cases
		// NOTE(review): r is presumably a table of {score, probability} rows sorted by
		// ascending score - confirm against KAStatistics.reshape
		double[][] r = KAStatistics.reshape(s,p);
		// Special case 1: l = -1 AND u = 1 (with non-zero probability)
		if (r[0][0] == -1. && r[0][1] > 0 && r[r.length-1][0] == 1. && r[r.length-1][1] > 0)
			System.out.println("***** CAUTION: This is a special case with minimum score = -1 and maximum score = 1. Calculated K for M1 may not be correct *****");
		// Special case 2: u = 1 (with non-zero probability)
		else if (r[r.length-1][0] == 1. && r[r.length-1][1] > 0)
			System.out.println("***** CAUTION: This is a special case with maximum score = 1. Calculated K for M1 may not be correct *****");
		// Special case 3: l = -1 (with non-zero probability)
		else if (r[0][0] == -1. && r[0][1] > 0)
			System.out.println("***** CAUTION: This is a special case with minimum score = -1. Calculated K for M1 may not be correct *****");

		int m = (int) Methods.min(s);
		int n = (int) Methods.max(s);
		int rows = s.length;
		int cols = s[0].length;
		// Every integer between the minimum and maximum score, in ascending order
		int[] steps = new int[n-m+1];
		for (int i=m; i<=n; i++)
			steps[i-m] = i;
		// Position of the step 0 within steps
		int zeroIndex = 0-m;

		// Arrays holding negative and positive integers, respectively ranging
		// between 0 and the minimum and maximum score, respectively.
		int[] negativeValues = Methods.subarray(0,zeroIndex,steps);
		int[] positiveValues = Methods.subarray(zeroIndex+1,steps);

		// Arrays holding the distinct positive and negative scores that actually occur in s
		int[] negativeSteps = new int[0];
		int[] positiveSteps = new int[0];
		for (int i=0; i<rows; i++)
		{
			for (int j=0; j<cols; j++)
			{
				if (s[i][j] < 0 && Methods.indexOf((int) s[i][j],negativeSteps) < 0)
					negativeSteps = Methods.addElement((int) s[i][j],negativeSteps);
				else if (s[i][j] > 0 && Methods.indexOf((int) s[i][j],positiveSteps) < 0)
					positiveSteps = Methods.addElement((int) s[i][j],positiveSteps);
			}
		}
		// Sort the possible scores
		negativeSteps = Methods.quickSort(negativeSteps);
		positiveSteps = Methods.quickSort(positiveSteps);

		// Split the matrix P into one matrix for each possible transition (K-D (1992) p. 137)
		Matrix[] P = splitP(steps,s,p);
		// Compute the Phats on p. 138
		Matrix[] Phat = pHat(P,zeroIndex);
		// Do the corresponding for T; only the hatted version is used below
		Matrix[] That = T(theta,steps,u,Phat);

		// A freshly allocated boolean array is already all false
		boolean[][] flags = new boolean[rows][cols];

		// A tree structure will keep track of all possible transitions
		KDTreeStructure[] Qtree = buildTree(negativeValues,n,rows,cols);
		// Perform the iteration to determine the approximate infinite sum Q
		Matrix[] Q = Q(new Matrix[0],Phat,1,negativeValues,positiveSteps,Qtree,flags);
		Matrix Qsum = new Matrix(rows,cols);
		for (int i=0; i<Q.length; i++)
			Qsum.plusEquals(Q[i]);

		double[] z = stationaryDistribution(Qsum.getArray());

		// Do the same thing to determine G but reverse the positive and negative scores
		negativeValues = Methods.reverse(Methods.vectorMultiply(positiveValues,-1));
		positiveSteps = Methods.reverse(Methods.vectorMultiply(negativeSteps,-1));

		// The Q iteration marked entries in flags, so clear them before the second run
		for (int i=0; i<rows; i++)
			for (int j=0; j<cols; j++)
				flags[i][j] = false;

		// Mirror the Thats so that index i holds the matrix of the step with opposite sign
		Matrix[] reversedThat = new Matrix[steps.length];
		for (int i=0; i<steps.length; i++)
			reversedThat[i] = That[steps.length-(i+1)];
		KDTreeStructure[] Gtree = buildTree(negativeValues,-1*m,rows,cols);
		Matrix[] G = Q(new Matrix[0],reversedThat,1,negativeValues,positiveSteps,Gtree,flags);
		Matrix Gsum = new Matrix(rows,cols);
		for (int i=0; i<G.length; i++)
			Gsum.plusEquals(G[i]);

		double[] w = stationaryDistribution(Gsum.getArray());

		// Numerator of c: w * (I - sum_i G[i]*exp(-theta*(i+1))) * (1/u)
		Gsum = new Matrix(rows,cols);
		for (int i=0; i<G.length; i++)
			Gsum.plusEquals(G[i].times(Math.exp(-1.0*theta*(i+1))));
		Gsum = Matrix.identity(rows,cols).minus(Gsum);
		double[] uInv = new double[u.length];
		for (int i=0; i<u.length; i++)
			uInv[i] = 1.0/u[i];
		double[] cNum = Gsum.times(new Matrix(uInv,cols)).getColumnPackedCopy();

		// Denominator of c: w * (sum_i (i+1)*G[i]) * 1
		Gsum = new Matrix(rows,cols);
		for (int i=0; i<G.length; i++)
			Gsum.plusEquals(G[i].times(i+1));
		// Column vector of ones; sized from the matrix dimensions so that alphabets
		// with a number of states other than four are handled as well
		Matrix e = new Matrix(cols,1,1.);
		double[] cDen = Gsum.times(e).getColumnPackedCopy();

		double c = (Methods.vectorInnerProduct(w,cNum)/Methods.vectorInnerProduct(w,cDen))*(1.0/(Math.exp(theta)-1));

		// Numerator of v: z * (I - sum_i Q[i]*exp(theta*(m+i))) * u
		Qsum = new Matrix(rows,cols);
		for (int i=0; i<Q.length; i++)
			Qsum.plusEquals(Q[i].times(Math.exp(theta*(m+i))));
		Qsum = Matrix.identity(rows,cols).minus(Qsum);
		double[] vNum = Qsum.times(new Matrix(u,cols)).getColumnPackedCopy();

		// Denominator of v: z * (sum_i (m+i)*Q[i]) * 1
		Qsum = new Matrix(rows,cols);
		for (int i=0; i<Q.length; i++)
			Qsum.plusEquals(Q[i].times(m+i));
		double[] vDen = Qsum.times(e).getColumnPackedCopy();

		double E = expect(s,p);
		double v = (Methods.vectorInnerProduct(z,vNum)/Methods.vectorInnerProduct(z,vDen))*E;

		double K = v*c;

		return K;
	}
	/**
	*	Splits the probability matrix into one matrix per step value.
	*	The matrix at index i keeps p[j][k] wherever s[j][k] equals steps[i]
	*	and is zero everywhere else.
	*/
	private static Matrix[] splitP(int[] steps, double[][] s, double[][] p) {
		final int rowCount = s.length;
		final int colCount = s[0].length;
		final Matrix[] perStep = new Matrix[steps.length];
		for (int idx = 0; idx < perStep.length; idx++) {
			perStep[idx] = new Matrix(rowCount,colCount);
		}
		// Visit every cell once and hand its probability to each matching step matrix
		for (int row = 0; row < rowCount; row++) {
			for (int col = 0; col < colCount; col++) {
				for (int idx = 0; idx < perStep.length; idx++) {
					if (s[row][col] == steps[idx]) {
						perStep[idx].set(row,col,p[row][col]);
					}
				}
			}
		}
		return perStep;
	}
	/**
	*	Calculates the Phats: for every index i the product
	*	(I - p[zeroIndex])^-1 * p[i].
	*	@param p The per-step probability matrices
	*	@param zeroIndex The index in p of the matrix subtracted from the identity
	*	@return One matrix per entry of p
	*/
	private static Matrix[] pHat(Matrix[] p, int zeroIndex) {
		int sz = p.length;
		int rows = p[0].getRowDimension();
		int cols = p[0].getColumnDimension();
		// The inverse does not depend on i, so it is computed once instead of once per step
		Matrix zeroStepInverse = Matrix.identity(rows,cols).minus(p[zeroIndex]).inverse();
		Matrix[] result = new Matrix[sz];
		for (int i=0; i<sz; i++)
			result[i] = zeroStepInverse.times(p[i]);
		return result;
	}
	/**
	*	Transforms every matrix in P into D^-1 * P[i] * D * exp(theta*step[i]),
	*	where D is the diagonal matrix built from u. Used for both the T's and the Thats.
	*/
	private static Matrix[] T(double theta, int[] positiveSteps, double[] u, Matrix[] P) {
		final Matrix[] transformed = new Matrix[positiveSteps.length];
		final int dim = u.length;

		// Diagonal matrix carrying u, and its inverse
		final double[][] diagonal = new double[dim][dim];
		for (int idx = 0; idx < dim; idx++) {
			diagonal[idx][idx] = u[idx];
		}
		final Matrix diagU = new Matrix(diagonal);
		final Matrix diagUInverse = diagU.inverse();

		for (int idx = 0; idx < positiveSteps.length; idx++) {
			final Matrix similar = diagUInverse.times(P[idx]).times(diagU);
			final double weight = Math.exp(theta*positiveSteps[idx]);
			transformed[idx] = similar.times(weight);
		}
		return transformed;
	}
	/**
	*	Creates one tree per entry of steps. Each tree gets a root node labelled with
	*	its step and holding an identity matrix, and is then built with a reference to
	*	the tree created just before it (null for the first one).
	*/
	private static KDTreeStructure[] buildTree(int[] steps, int n, int rows, int cols) {
		final int count = steps.length;
		final KDTreeStructure[] forest = new KDTreeStructure[count];
		int idx = 0;
		while (idx < count) {
			final KDTreeStructure current = new KDTreeStructure();
			forest[idx] = current;
			current.setRoot(new KDTreeNode(steps[idx],Matrix.identity(rows,cols)));
			if (idx == 0) {
				current.buildTree(steps,n,null);
			} else {
				current.buildTree(steps,n,forest[idx-1]);
			}
			idx++;
		}
		return forest;
	}
	/**
	*	Estimates the infinite sum Q by fixed-point iteration.
	*	Each pass refreshes the trees with the current estimate and rebuilds every
	*	matrix as pHat[i] plus the sum over the positive steps of
	*	pHat[negativeValues.length + step] times the matching tree path. Iteration stops
	*	when the largest value of (1 - row sum) of the summed matrices is within 1e-8 of zero.
	*	The iteration runs as a loop, so slow convergence cannot exhaust the call stack;
	*	no upper bound on the number of passes is enforced.
	*	@param q The starting estimate; ignored and replaced by copies of pHat when k is 1
	*	@param pHat The matrices indexed by step
	*	@param k 1 to start from scratch, any other value to continue from q
	*	@param negativeValues The negative step values, one per matrix of the result
	*	@param positiveSteps The positive steps that can occur
	*	@param tree The trees tracking the transitions, one per negative value
	*	@param flags Set to true at (i,j) whenever the summed estimate is positive there
	*	@return The converged estimate
	*/
	private static Matrix[] Q(Matrix[] q, Matrix[] pHat, int k, int[] negativeValues, int[] positiveSteps, KDTreeStructure[] tree, boolean[][] flags) {
		// Convergence threshold
		final double limit = 1e-8;

		Matrix[] current = q;
		if (k == 1) {
			// First pass: start from the Phats of the negative steps
			current = new Matrix[negativeValues.length];
			for (int i=0; i<current.length; i++)
				current[i] = pHat[i].copy();
		}

		while (true) {
			int rows = current[0].getRowDimension();
			int cols = current[0].getColumnDimension();
			Matrix[] next = new Matrix[current.length];
			Matrix qSum = new Matrix(rows,cols);
			tree = updateTree(tree,current,negativeValues);

			for (int i=0; i<negativeValues.length; i++) {
				Matrix innerSum = new Matrix(rows,cols);
				for (int j=0; j<positiveSteps.length; j++)
					innerSum = innerSum.plus(pHat[negativeValues.length+positiveSteps[j]].times(tree[i].getPath(positiveSteps[j],negativeValues[i],positiveSteps[positiveSteps.length-1])));

				next[i] = pHat[i].plus(innerSum);
				qSum = qSum.plus(next[i]);
			}

			// Largest shortfall of a row sum from 1; also record which entries are positive
			double[][] qV = qSum.getArray();
			double diff = Double.NEGATIVE_INFINITY;
			for (int i=0; i<rows; i++) {
				double sum = 0;
				for (int j=0; j<cols; j++) {
					sum += qV[i][j];
					if (qV[i][j] > 0)
						flags[i][j] = true;
				}
				diff = Math.max(1.0-sum,diff);
			}
			if (Math.abs(diff) <= limit)
				return next;

			current = next;
		}
	}
	/**
	*	Refreshes the values of the tree nodes: all trees are reset first, sharing
	*	one list of visited nodes, and only then updated from Q.
	*	@return The same array that was passed in
	*/
	private static KDTreeStructure[] updateTree(KDTreeStructure[] tree, Matrix[] Q, int[] labels) {
		Vector visited = new Vector();
		// Every tree must be reset before any of them is updated
		for (KDTreeStructure structure : tree) {
			structure.reset(visited);
		}
		for (KDTreeStructure structure : tree) {
			structure.updateTree(labels,Q);
		}
		return tree;
	}
	/**
	*	Finds the stationary distribution vector, w, of matrix P according to
	*	 w*P = w and sum(w) = 1.
	*	P is multiplied by itself repeatedly until no entry changes by more than 1e-8
	*	between two consecutive powers; the first row of the final power is returned.
	*/
	private static double[] stationaryDistribution(double[][] p) {
		final double tolerance = 1e-8;
		final Matrix transition = new Matrix(p);
		Matrix power = transition.copy();
		double largestChange;

		do {
			final Matrix previous = power;
			power = previous.times(transition);
			final double[][] delta = power.minus(previous).getArray();
			// Largest absolute entry of the difference between consecutive powers
			largestChange = Math.max(-1.0*Methods.min(delta),Methods.max(delta));
		} while (largestChange > tolerance);

		return power.getArray()[0];
	}



	/**
	*	Estimates the parameter theta* of Step 1 on p. 137 of
	*	<a href="http://www.jstor.org/view/00018678/ap050087/05a00070/0" target="_blank">
	*	Karlin &amp; Dembo (1992)</a>. Includes a simple routine for numerical approximation:
	*	theta is scaled up or down until the largest real eigenvalue of PHI(theta), with
	*	PHI[i][j] = p[i][j]*exp(theta*s[i][j]), is within 1e-8 of 1.
	*	No special care is taken to make sure it converges!
	*	@param s The score matrix
	*	@param p The associated probabilities
	*	@return The result of appending theta to the eigenvector u of step 2, where u is the
	*	eigenvector belonging to the largest real eigenvalue, scaled to sum to 1.
	*	NOTE(review): earlier documentation stated that theta is at the first position;
	*	the argument order of the Methods.append call suggests it may come last - confirm
	*	against Methods.append
	*/
	public static double[] theta(double[][] s, double[][] p) {
		// Convergence threshold
		double limit = 1e-8;
		// Initial guess
		double guess = 0.075;
		int m = s.length;
		int n = s[0].length;
		double[][] fiD = new double[m][n];
		double target = 1.;
		// Starts at 0 so that the loop below runs at least once
		double rho = 0;
		double diff;
		Matrix fi;
		EigenvalueDecomposition eig = null;
		while ((diff = Math.abs(rho-target)) > limit) {
			// Adjust the step sizes based on how far off the last iteration was
			double stepUp = 1.+Math.min(1.,diff);
			double stepDown = 1.-Math.min(.5,diff);
			if (rho < target)
				guess *= stepUp;
			else
				guess *= stepDown;

			// Calculate PHI based on the last value of theta
			for (int i=0; i<m; i++)
				for (int j=0; j<n; j++)
					fiD[i][j] = p[i][j]*Math.exp(guess*s[i][j]);
			fi = new Matrix(fiD);
			eig = fi.eig();
			// Determine the spectral radius of PHI(theta)
			rho = Methods.max(eig.getRealEigenvalues());
		}

		// The eigenvalues of a nonsymmetric matrix come in no guaranteed order, so look up
		// which column of V belongs to the largest real eigenvalue instead of assuming column 0
		double[] realEigenvalues = eig.getRealEigenvalues();
		int dominant = 0;
		for (int i=1; i<realEigenvalues.length; i++)
			if (realEigenvalues[i] > realEigenvalues[dominant])
				dominant = i;

		Matrix V = eig.getV();
		double[] eigV = new double[m];
		for (int i=0; i<m; i++)
			eigV[i] = V.get(i,dominant);
		// Scale the eigenvector so that its entries sum to 1
		eigV = Methods.vectorDivide(eigV,Methods.sum(eigV));
		return Methods.append(eigV,new double[] {guess});
	}

}
