/**
*	SIGRS - Identifying genomic regions of contrasting composition using a partial sum process
*	Copyright (C) 2008 Pontus Larsson
*	 
*	This file is part of SIGRS.
*	  
*	SIGRS is free software: you can redistribute it and/or modify
*	it under the terms of the GNU General Public License as published by
*	the Free Software Foundation, either version 3 of the License, or
*	(at your option) any later version.
*
*	SIGRS is distributed in the hope that it will be useful,
*	but WITHOUT ANY WARRANTY; without even the implied warranty of
*	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*	GNU General Public License for more details.
*
*	You should have received a copy of the GNU General Public License
*	along with SIGRS. If not, see <http://www.gnu.org/licenses/>.
*/   
package SIGRS;


import java.io.File;
import java.io.BufferedReader;
import java.io.FileReader;
    
/**
*	SIGRS is a collection of routines used in searching for regions of contrasting composition (CCRs) in sequence files using a partial sum process.
*	Significance of segments is evaluated using Karlin-Altschul statistics and specifically an extension by Karlin-Dembo allowing
*	for nucleotides to have a Markov-dependence (see e.g. <a href="http://www.pnas.org/cgi/reprint/90/12/5873" target="_blank">Karlin &amp; Altschul (1993)</a> and
*	<a href="http://www.jstor.org/view/00018678/ap050087/05a00070/0" target="_blank">Karlin &amp; Dembo (1992)</a>).
*	<P>
*	The routines are provided as is and no guarantee regarding stability etc. is given so use at your own risk!
*	<P>
*	<P>
*	See publication <b>Larsson, P., Hinas, A., Ardell, D.H., Kirsebom, L.A., Virtanen, A. and S&ouml;derbom, F.</b> <i>De novo search for non-coding RNA genes in the AT-rich genome of Dictyostelium discoideum: performance of
*	Markov-dependent genome feature scoring</i>
*	<P>
*	Questions and comments can be directed to <a href="mailto:Pontus.Larsson@icm.uu.se">Pontus.Larsson@icm.uu.se</a>
*	@author Pontus Larsson
*/

public class SIGRSScanForCCR {

	// Upper bound, in chars, on the buffer used when reading a sequence file
	private static final int MAX_CHUNK_SIZE = 64*1024*1024;

	// Values <= 1 are handed to KAStatistics.cutoff to derive the score threshold,
	// values > 1 are used directly as the score threshold
	private double ALPHA;
	// Passed on to the KAStatistics routines; recomputed by setInFile from the nucleotide counts
	private int N;
	// 0 -> single nucleotides are scored (M0 matrix), 1 -> dinucleotides are scored (M1 matrix)
	private int model;
	// Excursions with an expect above this value are discarded
	private double expect;
	// Ids (header line without the leading '>') of the sequences encountered so far
	private String[] seqId;
	// Lengths of the sequences encountered so far
	private int[] seqLength;
	private File[] inFile;
	private SIGRSScoreObject sso;

	/**
	*	Creates a scanner that takes its score matrices and K/L parameters from the given score object.
	*	The model (-1) and the expect cutoff (-1) start out unset, ALPHA starts at 1.
	*	@param sso The score object queried for score matrices and the K and L parameters
	*/
	public SIGRSScanForCCR(SIGRSScoreObject sso) {
		this.sso = sso;
		this.ALPHA = 1.;
		this.model = -1;
		this.expect = -1.;
		this.seqId = new String[0];
		this.seqLength = new int[0];
		this.N = 0;
	}

	private void addSeqId(String id) {
		this.seqId = Methods.addElement(id,seqId);
	}

	private void addSeqLength(int len) {
		this.seqLength = Methods.addElement(len,seqLength);
	}

	public void setN(int N) {this.N = N;}
	public void setModel(int model) {this.model = model;}
	public void setExpect(double expect) {this.expect = expect;}
	public void setAlpha(double alpha) {this.ALPHA = alpha;}

	/**
	*	Sets the files to scan. Clears the recorded sequence ids and lengths and recomputes N
	*	as the total, over all files, of the summed sub-range (0,5) of the mononucleotide counts.
	*	@param inFile The sequence files to scan
	*	@throws Exception If counting the nucleotides of a file fails
	*/
	public void setInFile(File[] inFile) throws Exception {
		this.inFile = inFile;
		this.seqId = new String[0];
		this.seqLength = new int[0];
		this.N = 0;
		for (int i=0; i<inFile.length; i++)
			this.N += Methods.sum(Methods.subarray(0,5,Methods.countMonoNucleotides(this.inFile[i])));
	}

	public int getN() {return this.N;}
	public String[] getSeqId() {return this.seqId;}
	public int[] getSeqLength() {return this.seqLength;}
	public int getModel() {return this.model;}

	/**
	*	Scores the input files and identifies contrasting composition regions (CCRs)
	*	@param direction The direction that is searched. The input sequence is always searched in the forward direction, this is to get the annotations correct. 1 -> forward strand, -1 -> reverse strand
	*	@return A matrix with CCRs where each row is a CCR and the columns are [0] -> start index (inclusive), [1] -> end index (exclusive), [2] -> orientation, [3] -> expect, [4] -> index of the sequence the CCR was found in
	*	@throws IllegalStateException If no input files have been set or the model is neither 0 nor 1
	*	@throws Exception If reading a sequence file fails
	*/
	public double[][] findCCRs(int direction) throws Exception {
		if (this.inFile == null)
			throw new IllegalStateException("No input files set, call setInFile first");

		double[][] CCR = new double[0][0];
		for (int i=0; i<this.inFile.length; i++) {
			CCR = Methods.append(findCCRs(this.inFile[i],direction),CCR);
		}
		return CCR;
	}

	/**
	*	Scans a single sequence file. Sequence ids and lengths encountered are recorded as a side effect.
	*	@param seqFile The file to scan
	*	@param direction Stored as the orientation of each CCR. A negative value additionally makes the
	*	coordinates of the CCRs be mirrored around the sequence length once the end of a sequence is reached
	*	@return The CCRs found in the file, same layout as for {@link #findCCRs(int)}
	*/
	private double[][] findCCRs(File seqFile, int direction) throws Exception {

		double[][] modelScores;
		double K;
		double L;

		if (this.model == 0) {
			modelScores = this.sso.getM0ScoreMatrix();
			K = this.sso.getK0();
			L = this.sso.getL0();
		}
		else if (this.model == 1) {
			modelScores = this.sso.getM1ScoreMatrix();
			K = this.sso.getK1();
			L = this.sso.getL1();
		}
		else
			// Fail with a clear message instead of an index error on an empty score matrix
			throw new IllegalStateException("Unsupported model " + this.model + ", expected 0 or 1");

		// Determine an initial score cutoff threshold
		double threshold;
		if (this.ALPHA <= 1)
			threshold = KAStatistics.cutoff(this.ALPHA,this.N,K,L);
		else
			threshold = this.ALPHA;

		double[][] scores = extendScoreMatrix(modelScores,this.model);

		double score = 0;

		double[][] CCR = new double[0][5];
		// Index of the first CCR belonging to the sequence currently being read
		int next = 0;
		double sum = 0;
		double max = 0;
		boolean inExcursion = false;
		// True while a header line is being read and its terminating line break has not been seen yet
		boolean inIdLine = false;
		int maxIndex = 0;

		char[] cBuff;
		// Buffer size in chars: half of the memory currently held by the JVM or MAX_CHUNK_SIZE,
		// whichever is the least. Computed as long since totalMemory() may exceed the int range.
		int chunkSize = (int) Math.max(1L,Math.min(Runtime.getRuntime().totalMemory()/2,(long) MAX_CHUNK_SIZE));

		int rd = 0;
		// Number of sequence characters read so far in the current sequence
		int i = 0;
		int buffIndex;
		StringBuilder name = new StringBuilder();
		int last;
		int present = -1;
		int seqIndex = this.seqId.length-1;

		BufferedReader br = new BufferedReader(new FileReader(seqFile));
		try {
			while (true) {
				cBuff = new char[chunkSize];
				buffIndex = 0;
				rd = br.read(cBuff,0,chunkSize);
				if (rd < 0) {
					if (inExcursion) {
						CCR = endExcursion(CCR,max,maxIndex,direction,this.N,K,L,this.expect,threshold);
						inExcursion = false;
					}
					// A header that ends the file without a line break is still recorded,
					// this keeps the ids in step with the lengths
					if (inIdLine) {
						this.addSeqId(name.substring(1));
						inIdLine = false;
					}
					this.addSeqLength(i);

					// If we are searching reverse complement sequence, correct
					// starting and stopping positions now that we know the length
					// of the sequence
					if (direction < 0)
						mirrorReverseStrandCCRs(CCR,next,i);
					break;
				}
				while (buffIndex < rd) {

					last = present;
					present = -1;

					// A line break is encountered
					if (cBuff[buffIndex] == '\n' || cBuff[buffIndex] == '\r') {
						// Jump over the line break
						buffIndex = Methods.skipLineBreak(cBuff,buffIndex,rd);
						// A header is only still open here if it ended exactly at the end of the
						// previous chunk, this line break completes it
						if (inIdLine) {
							this.addSeqId(name.substring(1));
							inIdLine = false;
						}
						if (buffIndex == rd) {
							// Keep the previous nucleotide so that the dinucleotide spanning
							// the chunk boundary is still scored
							present = last;
							break;
						}
					}

					// A new sequence begins, or a header continues from the previous chunk
					if (cBuff[buffIndex] == '>' || inIdLine) {
						// End the current excursion if within one
						if (inExcursion) {
							CCR = endExcursion(CCR,max,maxIndex,direction,this.N,K,L,this.expect,threshold);
							inExcursion = false;
						}
						// If we are searching reverse complement sequence, correct
						// starting and stopping positions now that we know the length
						// of the sequence
						if (direction < 0)
							mirrorReverseStrandCCRs(CCR,next,i);
						if (!inIdLine) {
							if (i > 0)
								this.addSeqLength(i);
							name = new StringBuilder();
							seqIndex++;
							inIdLine = true;
						}
						last = -1;
						next = CCR.length;
						i = 0;
						while (buffIndex < rd && cBuff[buffIndex] != '\n' && cBuff[buffIndex] != '\r') {
							name.append(cBuff[buffIndex]);
							buffIndex++;
						}
						// Stopping before the end of the chunk means the line break was reached
						boolean headerComplete = buffIndex < rd;
						buffIndex = Methods.skipLineBreak(cBuff,buffIndex,rd);
						// Record the id as soon as the header is known to be complete, so that
						// the next chunk is not mistaken for a continuation of the header
						if (headerComplete || buffIndex != rd) {
							this.addSeqId(name.substring(1));
							inIdLine = false;
						}
						if (buffIndex == rd)
							break;
					}

					// Change to upper case
					if (cBuff[buffIndex] >= 'a' && cBuff[buffIndex] <= 'z')
						cBuff[buffIndex] -= 32;
					// Change U to T
					if (cBuff[buffIndex] == 'U')
						cBuff[buffIndex] = 'T';

					// Characters not present in the nucleotide code are mapped to index 0
					present = Math.max(Methods.nucleotideCode.indexOf(cBuff[buffIndex]),0);
					i++;

					if ((model == 0 && present >= 0 && present <= 5) || (model == 1 && last >= 0 && last <= 5 && present >= 0 && present <= 5)) {
						if (model == 0)
							score = scores[present][0];
						else if (model == 1)
							score = scores[last][present];

						// Is this the start of an excursion?
						if (!inExcursion && score > 0) {
							CCR = Methods.addElement(new double[5],CCR);
							CCR[CCR.length-1][0] = i-2;
							CCR[CCR.length-1][4] = seqIndex;
							maxIndex = i-1;

							// For independent nucleotides, the hit begins at the present nucleotide
							if (model == 0)
								CCR[CCR.length-1][0] = i-1;

							sum = score;
							max = sum;
							inExcursion = true;
						}
						// If within an excursion, is the aggregate score the maximum so far?
						else if (inExcursion && (sum += score) >= 0) {
							if (sum > max) {
								max = sum;
								maxIndex = i-1;
							}
						}
						// Is it the end of an excursion?
						if (inExcursion && sum < 0) {
							CCR = endExcursion(CCR,max,maxIndex,direction,N,K,L,expect,threshold);
							inExcursion = false;
						}
					}
					buffIndex++;
				}
			}
		}
		finally {
			br.close();
		}
		return CCR;
	}

	/**
	*	Prepares the score matrix to score ambiguous and masked bases.
	*	Masked bases are scored minus infinity. Dinucleotides involving
	*	an ambiguous base are scored the minimum score possible for that dinucleotide.
	*	@param modelScores The score matrix of the model
	*	@param model 0 -> single nucleotide scores (first column used), 1 -> dinucleotide scores
	*	@return The score matrix with an extra first row (and column for model 1) for ambiguous bases
	*	and an extra last row (and column for model 1) for masked bases
	*/
	private static double[][] extendScoreMatrix(double[][] modelScores, int model) {
		double min = Methods.min(modelScores);
		double negInf = Double.NEGATIVE_INFINITY;

		int rows = modelScores.length;
		int cols = modelScores[0].length;

		double[][] scores = new double[0][];
		if (model == 0) {
			scores = new double[rows+2][cols];
			// An ambiguous nucleotide is scored the lowest score possible
			scores[0][0] = min;
			// A masked nucleotide is scored -Inf
			scores[rows+1][0] = negInf;
			// Fill in the rest of the scores
			for (int i=0; i<rows; i++)
				scores[i+1][0] = modelScores[i][0];
		}
		else if (model == 1) {
			scores = new double[rows+2][cols+2];
			// The last row & column holds the score when encountering a masked segment = -Inf
			scores[rows+1] = Methods.newDoubleArray(cols+2,negInf);
			scores = Methods.setColumn(scores,cols+1,Methods.newDoubleArray(rows+2,negInf));

			// The first row holds the scores for going FROM an ambiguous nucleotide.
			// Set to be the lowest score possible when going to the destination nucleotide
			for (int i=0; i<cols; i++)
				scores[0][i+1] = Methods.min(Methods.getColumn(i,modelScores));
			// The first column holds the scores for going TO an ambiguous nucleotide.
			// Set to be the lowest score possible when going from the first nucleotide
			for (int i=0; i<rows; i++)
				scores[i+1][0] = Methods.min(modelScores[i]);

			// Encountering a dinucleotide with both nucleotides ambiguous is assigned the lowest score possible
			scores[0][0] = min;

			for (int i=0; i<rows; i++)
				for (int j=0; j<cols; j++)
					scores[i+1][j+1] = modelScores[i][j];
		}
		return scores;
	}

	/**
	*	Mirrors the coordinates of the reverse strand CCRs (orientation &lt; 0) around the sequence length, in place.
	*	@param CCR The CCR matrix
	*	@param from Index of the first row to consider
	*	@param seqLen Length of the sequence the rows belong to
	*/
	private static void mirrorReverseStrandCCRs(double[][] CCR, int from, int seqLen) {
		for (int j=from; j<CCR.length; j++) {
			if (CCR[j][2] < 0) {
				double end = CCR[j][1];
				CCR[j][1] = seqLen-CCR[j][0];
				CCR[j][0] = seqLen-end;
			}
		}
	}

	/**
	*	Closes the excursion held in the last row of the CCR matrix. The excursion is limited to the
	*	region between its start and its peak. It is dropped from the matrix if it is empty, if its
	*	expect exceeds the expect cutoff or if its peak score is below the threshold. Otherwise its
	*	orientation and expect are stored.
	*	@param CCR The CCR matrix, the last row being the open excursion
	*	@param max The peak score of the excursion
	*	@param maxIndex The index at which the peak score was reached
	*	@param direction The orientation to store for the excursion
	*	@param N Passed to KAStatistics.expect together with K and L
	*	@param K Passed to KAStatistics.expect
	*	@param L Passed to KAStatistics.expect
	*	@param expect The expect cutoff
	*	@param threshold The score threshold
	*	@return The CCR matrix, with the last row either completed or removed
	*/
	private static double[][] endExcursion(double[][] CCR, double max, int maxIndex, int direction, int N, double K, double L, double expect, double threshold) {
		double e = KAStatistics.expect(max,N,K,L);

		int next = CCR.length-1;
		// Limit the excursion to the region between the start of the excursion and the peak
		CCR[next][1] = maxIndex+1;

		// Should this excursion be discarded?
		if (CCR[next][1] == CCR[next][0] || e > expect || max < threshold)
			CCR = Methods.subarray(0,next,CCR);

		// Store excursion coordinates and its expect
		else {
			CCR[next][2] = direction;
			CCR[next][3] = e;
		}

		return CCR;
	}
}

