package iptgxdb.insilico;

import java.awt.Color;
import java.io.File;
import java.io.FileWriter;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

import iptgxdb.utils.CLIUtils;
import iptgxdb.utils.FastaReader;
import iptgxdb.utils.GenomeFeature;
import iptgxdb.utils.GenomeLocation;
import iptgxdb.utils.GenomeLocation.Strand;
import iptgxdb.utils.GenomicsUtil;
import iptgxdb.utils.UOBufferedWriter;
import iptgxdb.utils.Utils;

/**
 * InSilicoORFs performs an ORF search on all six reading frames of a
 * given input genome sequence.
 * 
 * @author Ulrich Omasits
 * @date 25.11.2011
 */
public class InSilicoORFs {
	// fixStopCodons close any open reading frame.
	static List<String> fixStopCodons = Arrays.asList("TAA","TAG","TGA");
	// fixStartCodons open a new reading frame and at the same time prohibit any further reading frames to be opened afterwards
	//   therefore the longest possible ORF with a fixStartCodon is the main ORF and besides only extensions are allowed
	static List<String> fixStartCodons = Arrays.asList("ATG");
	// alternativeStartCodons are additional startCodons that will open a new reading frame (if still possible)
	static List<String> alternativeStartCodons = Arrays.asList("GTG","TTG","CTG");
	// a minimum length for an ORF in AA to be reported
	static int minProteinLength = 10;
	
	// if set, a stop codon without any fixStartCodon upstream reports only its longest alternative ORF
	static boolean longestAlternativeAnchorOnly = false;
	
	// NOTE: the defaults shown in the help texts are read from the static fields above,
	//       so those fields have to be declared before this initializer runs.
	@SuppressWarnings("serial")
	public static Options options = new Options() {{
		addOption( CLIUtils.createArgOption("seq", "file", "the input sequence", true, false) );
		addOption( CLIUtils.createArgOption("out", "file", "the output gff file", true, false) );
		addOption( CLIUtils.createArgOption("alt", "codon(s)", "alternative start codons (default: "+Utils.join(alternativeStartCodons," ")+")", false, true) );
		addOption( CLIUtils.createArgOption("min", "length", "the minimum protein length in aa (default: "+minProteinLength+")", false, false) );
		addOption( "laa", false, "take only longest alternative anchor for regions without a main start codon" );
		
		addOption( CLIUtils.createArgOption("tab", "output", "a tab-separated output file with sequences per entry", false, false) );
		addOption( CLIUtils.createArgOption("extend", "e", "get extension for all features 3' and 5' by <e> nucleotides in tabular output file", false, false) );
	}};
	
	/**
	 * Prints the command line help and terminates the JVM with exit status 0.
	 */
	public static void printUsageAndExit() {
		printUsageAndExit(0);
	}
	
	/**
	 * Prints the command line help and terminates the JVM.
	 * 
	 * @param status the process exit status (non-zero signals an error to the caller)
	 */
	private static void printUsageAndExit(int status) {
		new HelpFormatter().printHelp("java -jar iPtgxDB_insilico.jar", "In silico ORF predictor by Ulrich Omasits", options, null, true);
		System.exit(status);
	}
	
	/**
	 * Command line entry point: reads all sequences of a FASTA file, scans the three
	 * frames of the forward sequence and of the sequence returned by
	 * {@code GenomicsUtil.reverseNucleotides}, and writes every ORF (start codon up to
	 * and including the next in-frame stop codon) of sufficient length to a GFF file and,
	 * optionally, to a tab-separated file including the ORF and flanking sequences.
	 * 
	 * Exits with status 1 if the arguments cannot be parsed or the output file already exists.
	 * 
	 * @param args the command line arguments, see {@link #options}
	 * @throws Exception if reading the input or writing an output file fails
	 */
	public static void main(String[] args) throws Exception {
		// developer shortcut: a leading "debug" argument replaces the arguments with a fixed local setup
		if (args.length>0 && args[0].equals("debug")) {
			args = new String[]{	
					"-seq","C:/temp/NA1000.fasta",
					"-out","C:/temp/na1000_orfs_min18aa.gff3",
					"-min","18"
			};
		}

		// parse the command line arguments
		CommandLine cli = null;
		try {
			cli = new PosixParser().parse( options, args );
		} catch (ParseException e) {
			System.out.println(e.getMessage());
			printUsageAndExit(1); // invalid arguments are an error -> non-zero exit status
		}
		
		// required arguments
		File fInSeq = new File(cli.getOptionValue("seq")); 
		File fOutGFF = new File(cli.getOptionValue("out"));
		if (fOutGFF.exists()) {
			System.err.println("ERROR: " + fOutGFF.getName() + " already exists.");
			System.exit(1);
		}
		File fTab = CLIUtils.getFileOption(cli, "tab", true);
		int extend = Integer.parseInt(cli.getOptionValue("extend", "0"));
		
		// optional arguments
		if (cli.hasOption("alt"))
			alternativeStartCodons = Arrays.asList(cli.getOptionValues("alt"));
		if (cli.hasOption("min"))
			minProteinLength = Integer.parseInt(cli.getOptionValue("min"));
		
		longestAlternativeAnchorOnly = cli.hasOption("laa");
		
		int count_orfs = 0;
		int count_stops = 0;
		
		final UOBufferedWriter outGFF = new UOBufferedWriter(new FileWriter(fOutGFF));
		UOBufferedWriter outTab = null;
		try {
			if (fTab != null) {
				outTab = new UOBufferedWriter(fTab);
				outTab.writeTsvLine("id", "chromosome", "from", "to", "strand", "frame", "startCodon", "extension", "sequence", extend+"nt upstream", extend+"nt downstream");
			}
			
			outGFF.writeLine(GenomicsUtil.createGFFheader("allORFs", Color.DARK_GRAY));
			
			// read in entire sequence - could cause memory problems for large genomes...
			System.out.println("INFO: Reading sequence from '" + fInSeq.getName() + "'...");		
			Map<String,String> fasta = FastaReader.readFile(fInSeq, FastaReader.headerComplete);
			System.out.println("INFO: Reading sequence from '" + fInSeq.getName() + "' done!");
			
			// the set of start codons does not depend on the sequence, so build it once
			List<String> allStartCodons = Utils.concatLists(fixStartCodons, alternativeStartCodons);
			
			// iterate over all sequences
			for (Entry<String,String> e : fasta.entrySet()) {
				String seqId = e.getKey();
				StringBuilder seqForward = new StringBuilder(e.getValue());
				
				System.out.println("INFO: Searching for ORFs in "+seqId+"...");
				int n = seqForward.length();
				StringBuilder seqReverse = GenomicsUtil.reverseNucleotides(seqForward.toString());
				
				// scan the sequence first forward, then reverse
				for (Strand strand : Arrays.asList(Strand.PLUS, Strand.MINUS)) {
					StringBuilder seq = (strand==Strand.PLUS) ? seqForward : seqReverse;
					
					// scan all three frames
					for (int frame=0; frame<3; frame++) {
						String strFrame = strand + String.valueOf(frame+1);
						
						// all possible starts (0-based offset in seq -> codon) since the last stop codon, sorted by offset
						SortedMap<Integer,String> foundStartCodons = new TreeMap<Integer,String>();
						boolean fixStartCodonFound = false;
						
						for (int pos=frame; pos<n-2; pos+=3) {
							String codon = seq.substring(pos, pos+3);
							if (!fixStartCodonFound && allStartCodons.contains(codon)) {
								// start codon found and still allowed to open new reading frames -> save!
								foundStartCodons.put(pos, codon);
								if (fixStartCodons.contains(codon))
									fixStartCodonFound = true;
							} else if (fixStopCodons.contains(codon)) {
								// stop codon found! the ORF ends behind it (exclusive 0-based offset)
								int endInSeq = pos+3;
								
								if (longestAlternativeAnchorOnly && !fixStartCodonFound && foundStartCodons.size() > 1) {
									// no ATG was found -> keep only the longest alternative ORF!
									keepLongestStartOnly(foundStartCodons);
								}
								dropShortORFs(foundStartCodons, endInSeq);
								
								if (!foundStartCodons.isEmpty()) {
									count_stops++;
									// the start closest to the stop codon; all other ORFs are reported as extensions of it
									int innermostStart = foundStartCodons.lastKey();
									
									// write out ORFs for all saved start codons
									for (Entry<Integer,String> start : foundStartCodons.entrySet()) {
										int startInSeq = start.getKey();
										count_orfs++;
										
										// convert the 0-based offsets in the scanned strand to 1-based inclusive forward coordinates
										int from = (strand==Strand.PLUS) ? (startInSeq+1) : (n-endInSeq+1);
										int to = (strand==Strand.PLUS) ? (endInSeq) : (n-startInSeq);
										GenomeLocation location = new GenomeLocation(from, to, strand, seqId);
										
										GenomeFeature gff = new GenomeFeature();
										gff.seqId = seqId;
										gff.source = "sixFrameORF";
										gff.type = "CDS";
										gff.location = location;
										gff.setID("ORF_" + location.from + ".." + location.to + "_" + location.lengthAA() + "aa_" + strFrame);
										gff.setAtt("frame", strFrame);
										gff.setAtt("startCodon", start.getValue());
										// primitive comparison on purpose: comparing the boxed keys with != would test object identity
										if (startInSeq != innermostStart) {
											gff.setAtt("extension", Utils.signedString( (innermostStart-startInSeq)/3 ) + "aa" );
										}
										gff.setAtt("pseudo", "false");
										outGFF.writeLine(gff.toString());
										
										if (outTab != null) {
											writeTabRow(outTab, gff, location, seqForward, extend);
										}
									}
								}
								foundStartCodons.clear();
								fixStartCodonFound = false;
							}
						} // end sequence scan
					} // end frames
				} // end strands
			} // end sequences loop
		} finally {
			// always release both files, also if reading or writing failed half-way
			try {
				outGFF.close();
			} finally {
				if (outTab != null)
					outTab.close();
			}
		}
		
		System.out.println("INFO: Searching for ORFs done! Extracted "+count_orfs+" ORFs for "+count_stops+" stop sites to '"+fOutGFF.getName()+"'.");
	}
	
	/**
	 * Reduces the given start codons to the one with the smallest offset, i.e. the start
	 * of the longest ORF for the current stop codon.
	 * 
	 * @param starts the non-empty map of start offsets to codons, modified in place
	 */
	private static void keepLongestStartOnly(SortedMap<Integer,String> starts) {
		Integer longestStart = starts.firstKey(); // is sorted!
		String longestCodon = starts.get(longestStart);
		starts.clear();
		starts.put(longestStart, longestCodon);
	}
	
	/**
	 * Removes all start codons whose ORF would be shorter than {@link #minProteinLength}.
	 * 
	 * @param starts   the map of start offsets to codons, modified in place
	 * @param endInSeq the exclusive end offset of the ORF, i.e. the offset behind the stop codon
	 */
	private static void dropShortORFs(SortedMap<Integer,String> starts, int endInSeq) {
		// iterate over a copy of the keys, as entries are removed from the map while looping
		for (Integer startInSeq : new HashSet<Integer>(starts.keySet())) {
			int lengthAA = (endInSeq - startInSeq)/3 - 1; // nucleotides/3 minus the stop-codon
			if (lengthAA < minProteinLength)
				starts.remove(startInSeq);
		}
	}
	
	/**
	 * Writes one ORF to the tab-separated output: id, location, frame, start codon,
	 * extension, the ORF sequence and up to {@code extend} flanking nucleotides on both sides.
	 * Flanking regions are clamped to the sequence boundaries and left empty if the ORF
	 * touches the respective end of the sequence.
	 * 
	 * @param outTab     the tab-separated output
	 * @param gff        the feature providing the id and the attributes of the ORF
	 * @param location   the location of the ORF in forward coordinates
	 * @param seqForward the forward sequence the location refers to
	 * @param extend     the number of flanking nucleotides to report, nothing is reported if not positive
	 * @throws Exception if writing fails (the checked exceptions of the writer are not visible from this file)
	 */
	private static void writeTabRow(UOBufferedWriter outTab, GenomeFeature gff, GenomeLocation location, StringBuilder seqForward, int extend) throws Exception {
		int n = seqForward.length();
		GenomeLocation upstream = null;
		GenomeLocation downstream = null;
		if (extend > 0) {
			// the flanking regions in forward coordinates, null where the ORF touches a sequence end
			GenomeLocation before = null;
			GenomeLocation after = null;
			if (location.from > 1)
				before = new GenomeLocation(Math.max(location.from-extend, 1), location.from-1, location.strand, location.chromosome);
			if (location.to < n)
				after = new GenomeLocation(location.to+1, Math.min(location.to+extend, n), location.strand, location.chromosome);
			
			// on the minus strand the roles of the two flanking regions are swapped
			if (location.strand==Strand.PLUS) {
				upstream = before;
				downstream = after;
			} else if (location.strand==Strand.MINUS) {
				upstream = after;
				downstream = before;
			}
		}
		outTab.writeTsvLine(gff.getID(), location.chromosome, location.from, location.to, location.strand, gff.getAtt("frame"), gff.getAtt("startCodon"), gff.getAtt("extension"), location.getSequence(seqForward), 
				upstream!=null ? upstream.getSequence(seqForward) : "",
				downstream!=null ? downstream.getSequence(seqForward) : ""
		);
	}

}
