001    package calhoun.analysis.crf.io;
002    
003    import java.io.IOException;
004    import java.io.Serializable;
005    import java.util.Iterator;
006    import java.util.List;
007    
008    
009    /** interface to classes that handle reading and writing of input data for the CRF.  The 'location' of the input
010    data is passed in via a text string.  The interpretation of this text string is left up to the 
011    <code>InputHandler</code> implementation.  Commonly it will be a directory path, a file path, or some sort of 
012    configuration string.  The <code>InputHandler</code> returns {@link InputSequence}s and {@link TrainingSequence}s
013    to the engine.<p>
014    The <code>InputHandler</code> is also responsible for writing out data.  This is necessary for subsetting and other
015    partitioning utilities.  As with reading, an implementation dependent location string is used to specify where the
016    data should be written.  When writing data, it is safe for the input handler to assume that the sequences it receives
017    for writing are in the same format as those it created during reading.
018    */
019    public interface InputHandler extends Serializable {
020            
021            /** Returns the training data (input data plus hidden sequences) read from the specified location.
022            The result is a <code>List</code>, unlike {@link #readInputData(String)} which returns an <code>Iterator</code>.
023            The interpretation of the location string is dependent on the particular <code>InputHandler</code> implementation used.
024            @param location string location of the data.  Meaning is implementation dependent.
025            @param predict NOTE(review): not described by this interface; confirm its meaning against the implementations.
026            @return a list of training sequences
027            @throws IOException if there is a problem reading the data
028            */
029            List<? extends TrainingSequence<?>> readTrainingData(String location, boolean predict) throws IOException;
030            List<? extends TrainingSequence<?>> readTrainingData(String location) throws IOException; // overload without the predict flag; NOTE(review): confirm which predict behavior it corresponds to
031            /** Returns the input data read from the specified location.  The result is returned as an <code>Iterator</code>
032            because the inference algorithms can predict on the sequences one at a time.  The interpretation of
033            the location string is dependent on the particular <code>InputHandler</code> implementation used.
034            @param location string location of the data.  Meaning is implementation dependent.
035            @return an iterator over input sequences
036            @throws IOException if there is a problem reading the data
037            */
038            Iterator<? extends InputSequence<?>> readInputData(String location) throws IOException;
039    
040            /** Writes training data to the specified location.  Training data includes input data and
041            hidden sequences.  The interpretation of
042            the location string is dependent on the particular <code>InputHandler</code> implementation used.
043            @param location string location where the data should be written.  Meaning is implementation dependent.
044            @param data a list of training sequences to write out.
045            @throws IOException if there is a problem writing the data
046            */
047            void writeTrainingData(String location, List<? extends TrainingSequence<?>> data) throws IOException;
048    
049            /** Writes input data to the specified location.  The interpretation of
050            the location string is dependent on the particular <code>InputHandler</code> implementation used.
051            @param location string location where the data should be written.  Meaning is implementation dependent.
052            @param data an iterator over the input sequences to write out
053            @throws IOException if there is a problem writing the data
054            */
055            void writeInputData(String location, Iterator<? extends InputSequence<?>> data) throws IOException;
056    }