package iptgxdb.converter;

import java.awt.Color;
import java.io.BufferedReader;
import java.io.File;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;

import iptgxdb.utils.GenomeFeature;
import iptgxdb.utils.GenomeLocation;
import iptgxdb.utils.Utils;

/**
 * Ensembl2GFF parses a plain-text Ensembl file 
 * (http://www.ensembl.org/index.html)
 * and converts it to a GFF file.
 * 
 * @author Ulrich Omasits
 * @date 16.11.2011
 */
public class Ensembl2GFF extends AConverter {
	/**
	 * Feature keys that are converted when not running in extensive mode.
	 * An empty list would accept every feature key.
	 */
	private static final List<String> CONSIDERED_TYPES = Arrays.asList("CDS");
	//excluded: mat_peptide / sig_peptide / misc_structure / misc_RNA / ncRNA / rRNA / tmRNA / tRNA

	/** Prefix of every line that continues a feature: "FT" padded with blanks up to the data column. */
	private static final String CONTINUATION_PREFIX = "FT                   ";

	/** Prefix of a line that starts a new qualifier ("/key" or "/key=value"). */
	private static final String QUALIFIER_PREFIX = CONTINUATION_PREFIX + "/";

	/** Zero-based column at which the location / qualifier data starts. */
	private static final int DATA_COLUMN = CONTINUATION_PREFIX.length();

	/** Lines starting with this prefix end the feature table (start of the sequence). */
	private static final String SEQUENCE_PREFIX = "SQ   ";

	/** A line that opens a new feature: "FT", three blanks and a feature key. Compiled once instead of per line. */
	private static final java.util.regex.Pattern FEATURE_LINE = java.util.regex.Pattern.compile("FT   \\w.+");

	/**
	 * Reads the feature table of the input file line by line and registers one
	 * genome feature (via {@code addGenomeFeature}) for every feature whose key is
	 * accepted. A feature is accepted if {@code extensive} is set, or if its key is
	 * listed in {@link #CONSIDERED_TYPES}.
	 * <p>
	 * Parsing stops at the first line starting with "SQ   " or at the end of the file.
	 * Qualifier lines that belong to a feature that is not accepted are skipped.
	 * The reader is always closed, also when parsing fails.
	 *
	 * @param inputFile the plain-text file to parse
	 * @throws Exception if reading the file or creating a feature fails
	 */
	@Override
	protected void convert_internal(File inputFile) throws Exception {
		BufferedReader in = Utils.reader(inputFile);
		try {
			String line = in.readLine();
			// a null line means end of file (also covers an empty file)
			while (line != null && !line.startsWith(SEQUENCE_PREFIX)) {
				if (FEATURE_LINE.matcher(line).matches()) {
					// the feature key sits between "FT" and the data column; guard against short lines
					String type = line.substring(2, Math.min(DATA_COLUMN, line.length())).trim();
					if (extensive || CONSIDERED_TYPES.contains(type) || CONSIDERED_TYPES.isEmpty()) {
						// parseFeatureBlock consumes the whole feature and hands back the first
						// line after it, which still has to be examined by this loop
						line = parseFeatureBlock(in, type, line);
						continue;
					}
				}
				// every line that is not handled above must be consumed here,
				// otherwise the loop would spin on the same line forever
				line = in.readLine();
			}
		} finally {
			in.close();
		}
	}

	/**
	 * Parses one feature (its location and all of its qualifiers), creates the
	 * corresponding genome feature and registers it.
	 *
	 * @param in          reader positioned directly after the feature's first line
	 * @param type        the feature key taken from the first line
	 * @param featureLine the feature's first line (feature key and start of the location)
	 * @return the first line that does not belong to this feature any more,
	 *         or {@code null} if the end of the file was reached
	 * @throws Exception if reading fails or the feature cannot be created
	 */
	private String parseFeatureBlock(BufferedReader in, String type, String featureLine) throws Exception {
		Map<String,String> attributes = new LinkedHashMap<String, String>();
		String location = featureLine.length() > DATA_COLUMN ? featureLine.substring(DATA_COLUMN) : "";
		attributes.put("location", location);

		String key = null;
		String value = null;
		String line;
		while ((line = in.readLine()) != null && line.startsWith(CONTINUATION_PREFIX)) {
			if (line.startsWith(QUALIFIER_PREFIX)) {
				// a new qualifier starts: save the previous one first
				putQualifier(attributes, key, value);
				String qualifier = line.trim(); // only strips trailing whitespace, the line starts with "FT"
				int i = qualifier.indexOf('=');
				if (i == -1) {
					// flag qualifier without a value, e.g. /pseudo
					key = qualifier.substring(DATA_COLUMN + 1);
					value = "true";
				} else {
					key = qualifier.substring(DATA_COLUMN + 1, i);
					value = line.substring(i + 1);
				}
			} else if (key == null) {
				// continuation before the first qualifier: the location spans several lines
				location = location.trim() + line.substring(DATA_COLUMN).trim();
				attributes.put("location", location); // re-putting keeps "location" the first entry
			} else {
				// continuation of the current qualifier's value
				value += " " + line.substring(DATA_COLUMN);
			}
		}
		// save last attribute
		putQualifier(attributes, key, value);

		GenomeFeature gf = new GenomeFeature(seqId, source, type, new GenomeLocation(attributes.get("location"), seqId), null, null);
		if (extensive) {
			gf.atts.putAll(attributes);
		} else if (attributes.containsKey("pseudo")) {
			gf.atts.put("pseudo", "true");
		}
		// the ID is taken from the first available of: locus_tag, gene, ID
		if (attributes.containsKey("locus_tag")) {
			gf.setID(attributes.get("locus_tag"));
		} else if (attributes.containsKey("gene")) {
			gf.setID(attributes.get("gene"));
		} else if (attributes.containsKey("ID")) {
			gf.setID(attributes.get("ID"));
		}

		this.addGenomeFeature(gf);
		return line;
	}

	/**
	 * Stores a qualifier in the attribute map, removing one pair of enclosing
	 * double quotes from the value. Does nothing if no qualifier has been read yet.
	 *
	 * @param attributes the map to store the qualifier in
	 * @param key        the qualifier name, or {@code null} if there is nothing to store
	 * @param value      the qualifier value; only read if {@code key} is not {@code null}
	 */
	private static void putQualifier(Map<String,String> attributes, String key, String value) {
		if (key == null) {
			return;
		}
		// the length check keeps a value consisting of a single quote from breaking substring
		if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
			value = value.substring(1, value.length() - 1);
		}
		attributes.put(key, value);
	}

	/**
	 * @return the color associated with this converter (cyan)
	 */
	@Override
	protected Color getColor() {
		return Color.CYAN;
	}
}
