
/**
*	SIGRS - Identifying genomic regions of contrasting composition using a partial sum process
*	Copyright (C) 2008 Pontus Larsson
*	 
*	This file is part of SIGRS.
*	  
*	SIGRS is free software: you can redistribute it and/or modify
*	it under the terms of the GNU General Public License as published by
*	the Free Software Foundation, either version 3 of the License, or
*	(at your option) any later version.
*
*	SIGRS is distributed in the hope that it will be useful,
*	but WITHOUT ANY WARRANTY; without even the implied warranty of
*	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*	GNU General Public License for more details.
*
*	You should have received a copy of the GNU General Public License
*	along with SIGRS. If not, see <http://www.gnu.org/licenses/>.
*/       


package SIGRS;

import java.io.File;
import java.util.Date;

/**
*	This is the main class for the SIGRS routines.
*	<P>
*	SIGRS is a collection of routines used in searching for regions of contrasting composition (CCRs) in sequence files using a partial sum process.
*	Significance of segments is evaluated using Karlin-Altschul statistics and specifically an extension by Karlin-Dembo allowing
*	for nucleotides to have a Markov-dependence (see e.g. <a href="http://www.pnas.org/cgi/reprint/90/12/5873" target="_blank">Karlin &amp; Altschul (1993)</a> and
*	<a href="http://www.jstor.org/view/00018678/ap050087/05a00070/0" target="_blank">Karlin &amp; Dembo (1992)</a>).
*	<P>
*	The routines are provided as is and no guarantee regarding stability etc. is given so use at your own risk!
*	<P>
*	<P>
*	See publication <b>Larsson, P., Hinas, A., Ardell, D.H., Kirsebom, L.A., Virtanen, A. and S&ouml;derbom, F.</b> <i>De novo search for non-coding RNA genes in the AT-rich genome of Dictyostelium discoideum: performance of
*	Markov-dependent genome feature scoring</i>
*	<P>
*	Questions and comments can be directed to <a href="mailto:Pontus.Larsson@icm.uu.se">Pontus.Larsson@icm.uu.se</a>
*	@author Pontus Larsson
*
*/

public class SIGRSMain {

	/** Source name written to the second GFF column, suffixed with the model number (e.g. "SIGRS M1"). */
	private static final String SOURCE_NAME = "SIGRS";

	/** Template for the default feature name; the token "MODEL" is replaced by "M" followed by the model number. */
	private static final String CLASS_NAME_TEMPLATE = "SIGRS MODEL CCR";

	/** Help text printed for -h/--help and after a failed run. */
	private static final String USAGE = "\nSIGRSMain\n\n"+
					  "\t-BG\tBackground sequences file in FASTA format\n"+
					  "\t-TG\tTarget sequences file in FASTA format\n"+
					  "\t-C\tOptional division of training set into classes. The classes will be weighted equally\n"+
					  "\t-S\tA file with a saved score object\n"+
					  "\t-IN\tSpace separated list of input files in fasta format to scan for CCRs\n"+
					  "\t-OUT\tA file to write program results to instead of stdout\n"+
					  "\t-E\tExpect value to use as cutoff for reported CCRs [Default 0.1]\n"+
					  "\t-M\tModel to use for search, 0 -> Independent nucleotides, 1 -> Markov-dependent nucleotides [Default 1]\n"+
					  "\t-N\tManually specify the database size to use\n"+
					  "\t-L\tLabel to use as feature name in the GFF output instead of the default\n"+
					  "\t-SYM\tUse symmetric strands instead of treating strands independently; give F to treat strands independently [Default symmetric]\n"+
					  "\t-LOG\tSpecify a log file to write progress to\n"+
					  "\t-h\tDisplay this help message";

	/**
	*	Main method. Can be launched either to scan sequences for CCRs or just to calculate a score object.
	*	<P>
	*	A score object is obtained either from a saved file (-S option), optionally re-estimating the background and/or target
	*	frequencies from the -BG and -TG files, or computed from scratch when both -BG and -TG are given without -S.
	*	The target set can be divided into classes with the -C option followed by a space-separated list of tags; a sequence
	*	belongs to the first class whose tag occurs in its id line. Input files to scan are given with the -IN option.
	*	When no input files are given the score object itself is written to the output.
	*	<P>
	*	Further options: -E expect cutoff (default 0.1), -M model (default 1), -N manually specified database size,
	*	-L label replacing the default feature name in the GFF output, -SYM F to treat strands independently
	*	(symmetric is the default), -OUT result file instead of stdout, -LOG progress file instead of stdout.
	*	<P>
	*	Value lists for -C and -IN end at the first argument starting with '-'; empty arguments in a list are skipped.
	*	Any exception raised while running is printed together with the usage text.
	*	@param args The command line arguments
	*/
	public static void main(String[] args) {
		if (Methods.indexOf("-h",args) >= 0 || Methods.indexOf("--help",args) >= 0) {
			System.out.println(USAGE);
			System.exit(0);
		}

		File backgroundFile = optionFile("-BG",args);
		File targetFile = optionFile("-TG",args);
		File scoreFile = optionFile("-S",args);
		File logFile = optionFile("-LOG",args);
		File outFile = optionFile("-OUT",args);
		String[] targetClasses = optionValues("-C",args);

		String[] inNames = optionValues("-IN",args);
		File[] inFile = new File[inNames.length];
		for (int i=0; i<inNames.length; i++)
			inFile[i] = new File(inNames[i]);

		// Symmetric strands are the default; only the literal value "F" switches it off
		boolean symmetric = true;
		String symValue = optionValue("-SYM",args);
		if (symValue != null)
			symmetric = !"F".equals(symValue);

		LogWriter lw = null;
		LogWriter out = null;
		try {
			lw = (logFile != null) ? new LogWriter(logFile) : new LogWriter();
			out = (outFile != null) ? new LogWriter(outFile) : new LogWriter();

			lw.println(stamp("Program invoked with command:"));
			out.println(stamp("Program invoked with command:"));
			for (int i=0; i<args.length; i++) {
				lw.println("#\t"+args[i]);
				out.println("#\t"+args[i]);
			}

			SIGRSScoreObject sso = null;
			if (scoreFile != null) {
				sso = SIGRSScoreObject.getScoreObject(scoreFile,lw);
				if (backgroundFile != null) {
					byte[][] seqs = getBackgroundSeqs(backgroundFile,symmetric,lw);
					sso.calculateBackgroundFrequencies(seqs,lw);
					sso.calculateScores(lw);
				}
				if (targetFile != null) {
					byte[][] seqs = getTargetSeqs(targetFile,targetClasses,symmetric,lw);
					sso.calculateTargetFrequencies(seqs,lw);
					sso.calculateScores(lw);
				}
				if (inFile.length == 0)
					out.println(sso.toString());
			}
			else if (backgroundFile != null && targetFile != null) {
				sso = getScoreObject(backgroundFile,targetFile,targetClasses,symmetric,lw);
				out.println(sso.toString());
			}

			if (sso == null) {
				// Nothing can be computed or scanned without a score object; say so instead of exiting silently
				System.err.println("No score object available: specify -S, or both -BG and -TG");
				System.err.println(USAGE);
			}
			else if (inFile.length > 0) {
				double expect = 0.1;
				int model = 1;
				int N = -1;
				String value;
				if ((value = optionValue("-E",args)) != null)
					expect = Double.parseDouble(value);
				if ((value = optionValue("-M",args)) != null)
					model = Integer.parseInt(value);
				if ((value = optionValue("-N",args)) != null)
					N = Integer.parseInt(value);
				String label = optionValue("-L",args);
				scanForCCRs(inFile,model,sso,expect,N,symmetric,label,lw,out);
			}
		} catch (Exception e) {
			e.printStackTrace();
			System.out.println(USAGE);
		} finally {
			// Only writers that were given a file are closed; the no-argument writers are left open
			if (logFile != null)
				closeQuietly(lw);
			if (outFile != null)
				closeQuietly(out);
		}
	}

	/**
	*	Returns the argument following the given flag.
	*	@param flag The option flag to look for, e.g. "-BG"
	*	@param args The command line arguments
	*	@return The argument directly after the flag, or null if the flag is absent or is the last argument
	*/
	private static String optionValue(String flag, String[] args) {
		int index = Methods.indexOf(flag,args);
		if (index >= 0 && args.length > (index+1))
			return args[index+1];
		return null;
	}

	/**
	*	Returns the argument following the given flag as a File.
	*	@param flag The option flag to look for
	*	@param args The command line arguments
	*	@return A File for the argument after the flag, or null if the flag is absent or has no value
	*/
	private static File optionFile(String flag, String[] args) {
		String value = optionValue(flag,args);
		return (value == null) ? null : new File(value);
	}

	/**
	*	Collects all arguments following the given flag up to (not including) the next argument starting with '-'.
	*	Empty arguments are skipped rather than inspected with charAt(0), which would throw on an empty string.
	*	@param flag The option flag to look for
	*	@param args The command line arguments
	*	@return The collected values, possibly empty but never null
	*/
	private static String[] optionValues(String flag, String[] args) {
		String[] values = new String[0];
		int index = Methods.indexOf(flag,args);
		if (index < 0)
			return values;
		while ((index+1) < args.length && !args[index+1].startsWith("-")) {
			index++;
			if (args[index].length() > 0)
				values = Methods.addElement(args[index],values);
		}
		return values;
	}

	/**
	*	Builds a progress line of the form "# &lt;current date&gt;&lt;tab&gt;&lt;message&gt;".
	*	@param message The text to put after the timestamp
	*	@return The formatted line
	*/
	private static String stamp(String message) {
		return "# "+(new Date()).toString()+"\t"+message;
	}

	/**
	*	Closes a writer, reporting but not propagating any failure so that remaining cleanup still runs.
	*	@param writer The writer to close, may be null
	*/
	private static void closeQuietly(LogWriter writer) {
		if (writer == null)
			return;
		try {
			writer.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	*	Creates a SIGRSScoreObject that calculates the scores and parameters from the given background and target sequence files.
	*	The target set is not divided into classes and progress is written to a default LogWriter.
	*	@param backgroundFile A FASTA file containing sequence(s) representative of the background search space
	*	@param targetFile A FASTA file containing sequence(s) representative of the target distributions
	*	@param symmetric If true, the reverse complement strand is included in both the background and the target set
	*	@return A SIGRSScoreObject
	*	@throws Exception If reading the files or calculating the scores fails
	*/
	public static SIGRSScoreObject getScoreObject(File backgroundFile, File targetFile, boolean symmetric) throws Exception {
		return getScoreObject(backgroundFile, targetFile, new String[0], symmetric, new LogWriter());
	}

	/**
	*	Creates a SIGRSScoreObject from background and target files, optionally dividing the target set into classes.
	*	In symmetric mode the background frequencies are calculated from a temporary file (deleted on JVM exit) that
	*	holds the reverse complement of the background file with the background file appended to it.
	*	@param backgroundFile A FASTA file with the background sequences
	*	@param targetFile A FASTA file with the target sequences
	*	@param targetClasses Tags dividing the target set into classes, empty for a single class
	*	@param symmetric If true, both strands are used
	*	@param lw A writer to write progress to
	*	@return A SIGRSScoreObject with frequencies set and scores calculated
	*	@throws Exception If reading the files or calculating the scores fails
	*/
	private static SIGRSScoreObject getScoreObject(File backgroundFile, File targetFile, String[] targetClasses, boolean symmetric, LogWriter lw) throws Exception {

		byte[][] targetSeqs = getTargetSeqs(targetFile,targetClasses,symmetric,lw);

		SIGRSScoreObject sso = new SIGRSScoreObject();
		lw.println(stamp("Begins calculating scores"));
		File tempFile = backgroundFile;
		if (symmetric) {
			tempFile = File.createTempFile(backgroundFile.getName()+"_revcomp",".fa");
			tempFile.deleteOnExit();
			lw.println(stamp("Gets reverse complement of "+backgroundFile.getName()));
			Methods.reverseComplement(backgroundFile,tempFile);
			lw.println(stamp("Concatenates "+backgroundFile.getName()+" and "+tempFile.getName()));
			Methods.append(backgroundFile,tempFile);
		}
		sso.setBackgroundFrequencies(SIGRSScoreObject.calculateFrequencies(tempFile));
		sso.setTargetFrequencies(SIGRSScoreObject.calculateFrequencies(targetSeqs));
		sso.calculateScores(lw);

		return sso;
	}

	/**
	*	Adds one encoded sequence, and in symmetric mode also its reverse complement, to an accumulated byte array.
	*	Each added sequence is combined with a single 0 byte that acts as a separator between sequences
	*	(presumably the code for 'N' - confirm against Methods.encode).
	*	@param accumulated The bytes collected so far
	*	@param sequence The sequence string to encode and add
	*	@param symmetric If true, the reverse complement is added as well
	*	@return The new accumulated array
	*	@throws Exception If encoding fails
	*/
	private static byte[] addEncoded(byte[] accumulated, String sequence, boolean symmetric) throws Exception {
		accumulated = Methods.append(Methods.addElement((byte) 0,Methods.encode(sequence)),accumulated);
		if (symmetric) {
			// Use both strands
			accumulated = Methods.append(Methods.addElement((byte) 0,Methods.reverseComplement(Methods.encode(sequence))),accumulated);
		}
		return accumulated;
	}

	/**
	*	Parses the background file and encodes all its sequences into one concatenated byte array.
	*	@param backgroundFile A FASTA file with the background sequences
	*	@param symmetric If true, the reverse complement of every sequence is included
	*	@param lw A writer to write progress to
	*	@return An array with exactly one element holding all encoded background sequences
	*	@throws Exception If parsing or encoding fails
	*/
	private static byte[][] getBackgroundSeqs(File backgroundFile, boolean symmetric, LogWriter lw) throws Exception {
		lw.println(stamp("Parses background file "+backgroundFile.getPath()));
		// Assumes background file is big
		String[][] bg = Methods.parseBigFasta(backgroundFile);
		byte[][] backgroundSeqs = new byte[1][0];
		lw.println(stamp("Encodes background sequences"));
		for (int i=0; i<bg.length; i++)
			backgroundSeqs[0] = addEncoded(backgroundSeqs[0],bg[i][1],symmetric);

		return backgroundSeqs;
	}

	/**
	*	Parses the target file and encodes its sequences, optionally divided into classes.
	*	Without classes all sequences end up in a single element. With classes, each sequence is added to the first
	*	class whose tag occurs in the sequence id; sequences matching no tag are dropped, and classes that received
	*	no sequence are removed from the result.
	*	@param targetFile A FASTA file with the target sequences
	*	@param targetClasses Tags dividing the target set into classes, empty for a single class
	*	@param symmetric If true, the reverse complement of every sequence is included
	*	@param lw A writer to write progress to
	*	@return One encoded byte array per non-empty class
	*	@throws Exception If parsing or encoding fails
	*/
	private static byte[][] getTargetSeqs(File targetFile, String[] targetClasses, boolean symmetric, LogWriter lw) throws Exception {
		lw.println(stamp("Parses target file "+targetFile.getPath()));
		String[][] tg = Methods.parseFasta(targetFile);

		if (targetClasses.length == 0) {
			byte[][] single = new byte[1][0];
			lw.println(stamp("Encodes target sequences"));
			for (int i=0; i<tg.length; i++)
				single[0] = addEncoded(single[0],tg[i][1],symmetric);
			return single;
		}

		byte[][] targetSeqs = new byte[targetClasses.length][0];
		lw.println(stamp("Encodes target sequences and divides into "+String.valueOf(targetClasses.length)+" classes"));
		for (int i=0; i<tg.length; i++) {
			// Concatenate all sequences belonging to the same class, with a separator byte between them.
			// This will create a weighting so that all classes are weighted equally
			for (int j=0; j<targetClasses.length; j++) {
				if (tg[i][0].indexOf(targetClasses[j]) >= 0) {
					targetSeqs[j] = addEncoded(targetSeqs[j],tg[i][1],symmetric);
					// A sequence is assigned to the first matching class only
					break;
				}
			}
		}
		// Remove classes with no sequences; step back after a removal since the remaining elements shift down
		for (int i=0; i<targetSeqs.length; i++) {
			if (targetSeqs[i].length == 0) {
				targetSeqs = Methods.removeElementAt(i,targetSeqs);
				i--;
			}
		}

		return targetSeqs;
	}

	/**
	*	Scans the input files for contrasting composition regions (CCRs) and writes them in GFF format.
	*	In symmetric mode a single scan (direction 0) is made. Otherwise the files are scanned forward (direction 1),
	*	then a reverse complement copy of every input file is scanned (direction -1). The copies are written next to
	*	the input files with the suffix ".revcomp.fa" and are always deleted again, also when the scan fails.
	*	@param inputFile An array containing the input files in FASTA format. Each file may contain multiple sequences
	*	@param model The model type to use. 0 -> M0 (Independent nucleotides), 1 -> M1 (Markov-dependent nucleotides)
	*	@param sso A SIGRSScoreObject that holds all the necessary scores and model parameters
	*	@param expect The expect cutoff to use when reporting hits
	*	@param N A manually specified database size, used only if greater than 0
	*	@param symmetric If true, strands are treated symmetrically and only one scan is made
	*	@param label A feature name to use in the output instead of the default, or null
	*	@param lw A writer to write progress to
	*	@param out A writer to write the resulting CCRs to
	*	@throws Exception If reading the input or scanning fails
	*/
	public static void scanForCCRs(File[] inputFile, int model, SIGRSScoreObject sso, double expect, int N, boolean symmetric, String label, LogWriter lw, LogWriter out) throws Exception {
		SIGRSScanForCCR sfc = new SIGRSScanForCCR(sso);
		sfc.setExpect(expect);
		sfc.setModel(model);
		sfc.setInFile(inputFile);

		lw.println(stamp("Total database size is "+String.valueOf(sfc.getN())));
		if (N > 0) {
			lw.println(stamp("Database size has been manually specified to "+String.valueOf(N)));
			sfc.setN(N);
		}

		if (symmetric) {
			double[][] CCRs = sfc.findCCRs(0);
			logSequenceLengths(sfc,lw);
			lw.println(stamp("Total database size was "+String.valueOf(sfc.getN())+" Number of sequences was "+String.valueOf(sfc.getSeqId().length)+" "+String.valueOf(CCRs.length)+" CCRs found"));
			writeGFFAnnotationOfCCRs(CCRs,sfc.getSeqId(),sfc.getModel(),label,out);
			return;
		}

		File[] revInp = new File[inputFile.length];
		try {
			for (int i=0; i<inputFile.length; i++) {
				revInp[i] = new File(inputFile[i].getAbsolutePath()+".revcomp.fa");
				Methods.reverseComplement(inputFile[i],revInp[i]);
			}

			// Forward strand
			double[][] CCRs = sfc.findCCRs(1);
			logSequenceLengths(sfc,lw);
			writeGFFAnnotationOfCCRs(CCRs,sfc.getSeqId(),sfc.getModel(),label,out);

			// Reverse strand, scanned from the reverse complement copies
			sfc.setInFile(revInp);
			CCRs = sfc.findCCRs(-1);
			logSequenceLengths(sfc,lw);
			writeGFFAnnotationOfCCRs(CCRs,sfc.getSeqId(),sfc.getModel(),label,out);
		} finally {
			// Entries are null for files that were never reached because an earlier step failed
			for (int i=0; i<revInp.length; i++) {
				if (revInp[i] != null)
					revInp[i].delete();
			}
		}
	}

	/**
	*	Writes the id and length of every sequence of the latest scan to the progress writer.
	*	@param sfc The scanner to read sequence ids and lengths from
	*	@param lw A writer to write progress to
	*	@throws Exception If the scanner fails to deliver the sequence information
	*/
	private static void logSequenceLengths(SIGRSScanForCCR sfc, LogWriter lw) throws Exception {
		int[] len = sfc.getSeqLength();
		String[] ids = sfc.getSeqId();
		lw.println("# Seq lengths:");
		for (int i=0; i<len.length; i++)
			lw.println("#\t"+ids[i]+"\t"+String.valueOf(len[i])+" nt");
	}

	/**
	*	Writes the contrasting composition regions (CCRs) to output in GFF format. The tab delimited columns are as follows:
	*	<br>
	*	<br>1 -> Sequence identifier
	*	<br>2 -> Source name followed by the model, e.g. "SIGRS M1"
	*	<br>3 -> Feature name (the label if given, otherwise "SIGRS M&lt;model&gt; CCR")
	*	<br>4 -> Start position (element 0 of the CCR plus one)
	*	<br>5 -> End position (element 1 of the CCR)
	*	<br>6 -> Blank
	*	<br>7 -> Orientation: "+" if element 2 of the CCR is positive, "-" if negative, "+/-" if zero
	*	<br>8 -> Expect value (element 3 of the CCR)
	*	<br>9 -> Blank
	*	<br>10 -> Feature name again
	*	@param CCRs An array holding the CCRs, one row per region with at least five elements as described above;
	*	element 4 is the index into id of the sequence where the region was found
	*	@param id The identifiers of the scanned sequences
	*	@param model The model number used in the source name and the default feature name
	*	@param label A feature name to use instead of the default, or null
	*	@param lw A reference to an output writer
	*/
	public static void writeGFFAnnotationOfCCRs(double[][] CCRs, String[] id, int model, String label, LogWriter lw) {
		String src = SOURCE_NAME+" M"+String.valueOf(model);
		String cls = CLASS_NAME_TEMPLATE.replace("MODEL","M"+String.valueOf(model));
		if (label != null)
			cls = label;
		for (int i=0; i<CCRs.length; i++) {
			double[] ccr = CCRs[i];
			String strand = (ccr[2] > 0 ? "+" : (ccr[2] < 0 ? "-" : "+/-"));
			lw.print(id[(int) ccr[4]]+"\t");
			lw.print(src+"\t");
			lw.print(cls+"\t");
			// The cast applies to the start value only; one is added afterwards
			lw.print(String.valueOf((int) ccr[0]+1)+"\t");
			lw.print(String.valueOf((int) ccr[1])+"\t");
			lw.print(" \t");
			lw.print(strand+"\t");
			lw.print(String.valueOf(ccr[3])+"\t");
			lw.print(" \t");
			lw.println(cls);
		}
	}

}

