package util;



import java.io.*;
import java.util.Hashtable;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// NOTE(review): java.util.regexp is not a JDK package (likely a typo for
// java.util.regex); line kept as-is pending confirmation.
import java.util.regexp.*;
import org.apache.regexp.*;

/**
 * Reads a gene ontology annotation file (e.g. data/molecular-function) and
 * collects annotations for a caller-supplied list of protein IDs.
 *
 * <p>Each line of the file is searched for a protein ID. When the ID is on the
 * requested list, the matching {@link PfunProtein} becomes the "current"
 * protein and the name, GO number and MOC code found on the line are recorded
 * on it, subject to the flags given to the constructor.
 *
 * <p>The compiled patterns are shared static fields; all other state belongs
 * to the instance.
 *
 * @author bee
 */
public class PFunGO {

    // Flags selecting which protein members are pulled out of the file.
    private final boolean nameP;
    private final boolean goP;
    private final boolean mocP;
    private final String goFileName;

    private Vector proteinList;           // list of pnums to look for
    private Hashtable proteinData;        // PfunProteins, indexed by pnum
    private int lineNum;                  // 1-based number of the line being processed
    private PfunProtein currentProtein;   // protein that annotations are applied to

    // Regular Expression Patterns
    public static Pattern reIdPattern;
    public static Pattern reNamePattern;
    public static Pattern reGOPattern;
    public static Pattern reMOCPattern;

    /**
     * Compiles the four line-fragment patterns into the public static
     * pattern fields. Group 1 of the ID, name and GO patterns holds the value
     * of interest; the MOC pattern is a tab-delimited alternation of codes.
     */
    public static void setupRegex () {
        String go = "([0-9]+)";
        String id = "([A-Z0-9]+)";
        String nameId = "([a-zA-Z0-9]+\\_[a-zA-Z0-9]+)";
        // presumably GO evidence codes -- TODO confirm
        String moc = "\t(IMP)\t|\t(IGI)\t|\t(IPI)\t|\t(ISS)\t|\t(IDA)\t|\t(IEP)\t|\t(IEA)\t|\t(TAS)\t|\t(NAS)\t|\t(IGC)\t|\t(RCA)\t|\t(ND)\t|\t(IC)\t|\t(P)\t|\t(E)\t|\t(NR)\t";

        String idPattern = "SPTR\\s" + id;
        String namePattern = "\t" + nameId;
        String goPattern = "GO:" + go;
        String mocPattern = moc;

        reIdPattern = Pattern.compile(idPattern);
        reNamePattern = Pattern.compile(namePattern);
        reGOPattern = Pattern.compile(goPattern);
        reMOCPattern = Pattern.compile(mocPattern);
    }

    /**
     * Applies the patterns to one line of the file and updates the current
     * protein accordingly.
     *
     * <p>If the line contains an ID that is not on the requested list, the
     * line is ignored. If it contains a requested ID, the corresponding
     * protein is looked up in (or created and added to) the result table and
     * becomes the current protein. Lines on which no ID is found are applied
     * to whatever protein is current; when there is none yet, the line is
     * skipped.
     *
     * @param s one line of the annotation file
     */
    public void processRegexResults (String s)
    {
        // The patterns describe fragments of a line, so search with find();
        // matches() would require the whole line to match a single fragment.
        Matcher idMatcher = reIdPattern.matcher(s);
        if (idMatcher.find()) {
            String id = idMatcher.group(1);
            // Check to see whether ID matches one on
            // the list we are looking for. If not, skip it.
            if (!proteinList.contains(id)) {
                return;
            }
            currentProtein = (PfunProtein) proteinData.get(id);
            if (currentProtein == null) {
                currentProtein = new PfunProtein(id);
                // Register immediately; otherwise only the last protein seen
                // would ever reach the result table.
                proteinData.put(id, currentProtein);
            }
        }
        // NOTE(review): a line without an ID falls through and annotates the
        // current protein, as in the original flow -- confirm this is intended.

        if (currentProtein == null) {
            // Nothing to annotate until a requested ID has been seen.
            return;
        }

        if (nameP) {
            Matcher nameMatcher = reNamePattern.matcher(s);
            if (nameMatcher.find()) {
                currentProtein.setName(nameMatcher.group(1));
            }
        }

        if (goP) {
            Matcher goMatcher = reGOPattern.matcher(s);
            if (goMatcher.find()) {
                currentProtein.setGONumber(goMatcher.group(1));
            }
        }

        if (mocP) {
            Matcher mocMatcher = reMOCPattern.matcher(s);
            if (mocMatcher.find()) {
                // The whole match is "\tCODE\t"; drop the surrounding tabs.
                String whole = mocMatcher.group(0);
                currentProtein.setMOC(whole.substring(1, whole.length() - 1));
            }
        }
    }

    /**
     * Creates a reader for the given file and (re)compiles the shared
     * patterns.
     *
     * @param fileName path of the annotation file to read
     * @param pName    whether protein names should be recorded
     * @param pGo      whether GO numbers should be recorded
     * @param pMoc     whether MOC codes should be recorded
     */
    public PFunGO(String fileName, boolean pName,
                  boolean pGo, boolean pMoc)
    {
        nameP = pName;
        goP = pGo;
        mocP = pMoc;
        goFileName = fileName;
        setupRegex();
    }

    /**
     * Reads through the file and fills out the fields selected in the
     * constructor, starting from an empty result table.
     *
     * @param spNums protein IDs to look for
     * @return table of PfunProteins indexed by protein ID
     */
    public Hashtable fillOutList(Vector spNums)
    {
        return fillOutList(spNums, new Hashtable());
    }

    /**
     * Reads through the file and fills out the fields selected in the
     * constructor, adding to (and updating entries of) an existing table.
     *
     * <p>An exception while reading or processing a line is reported on
     * standard output and stops the scan; results gathered so far are still
     * returned. Failure to open or close the file is reported on standard
     * error and terminates the JVM with exit status 1.
     *
     * @param spNums protein IDs to look for
     * @param pData  table to add results to; it is modified and returned
     * @return {@code pData}
     */
    public Hashtable fillOutList(Vector spNums, Hashtable pData)
    {
        proteinData = pData;
        proteinList = spNums;
        lineNum = 0;
        currentProtein = null;
        String str = null;

        try {
            BufferedReader fin = new BufferedReader(new FileReader(goFileName));
            try {
                while ((str = fin.readLine()) != null) {
                    lineNum++;
                    processRegexResults(str);
                }
            }
            catch (Exception e) {
                System.out.println("Error in ProcessFileGO: " + str + " " + lineNum + " " + e.getMessage());
            }
            finally {
                // Release the file handle on every exit path.
                fin.close();
            }
        }
        catch (Exception ioe) {
            System.err.println("ProcessFileGO: " + goFileName + " " +
                               ioe.getMessage());
            System.exit(1);
        }
        // Kept from the original flow: store the last protein under getID().
        if (currentProtein != null) proteinData.put(currentProtein.getID(), currentProtein);
        return proteinData;
    }

}
