001 package calhoun.analysis.crf.io;
002
003 import java.io.IOException;
004 import java.io.Serializable;
005 import java.util.Iterator;
006 import java.util.List;
007
008
009 /** interface to classes that handle reading and writing of input data for the CRF. The 'location' of the input
010 data is passed in via a text string. The interpretation of this text string is left up to the
011 <code>InputHandler</code> implementation. Commonly it will be a directory path, a file path, or some sort of
012 configuration string. The <code>InputHandler</code> returns {@link InputSequence}s and {@link TrainingSequence}s
013 to the engine.<p>
014 The <code>InputHandler</code> is also responsible for writing out data. This is necessary for subsetting and other
015 partitioning utilities. As with reading, an implementation dependent location string is used to specify where the
016 data should be written. When writing data, it is safe for the input handler to assume that the sequences it receives
017 for writing are in the same format as those it created during reading.
018 */
019 public interface InputHandler extends Serializable {
020
021 /** Returns the training data read from the specified location. Training data includes input data and
022 hidden sequences. The result is returned as a <code>List</code> (see the declared return type). The interpretation of
023 the location string is dependent on the particular <code>InputHandler</code> implementation used.
024 @param location string location of the data. Meaning is implementation dependent.
025 @param predict NOTE(review): purpose is not documented in this interface; presumably indicates the data is being read for prediction rather than training -- confirm against implementations.
026 @return a list of training sequences
027 @throws IOException if there is a problem reading the data
028 */
029 List<? extends TrainingSequence<?>> readTrainingData(String location, boolean predict) throws IOException;
030 List<? extends TrainingSequence<?>> readTrainingData(String location) throws IOException; // overload taking only the location; how it relates to the predict flag is not stated here -- TODO confirm
031 /** Returns the input data read from the specified location. The result is returned as an <code>Iterator</code>
032 because the inference algorithms can predict on the sequences one at a time. The interpretation of
033 the location string is dependent on the particular <code>InputHandler</code> implementation used.
034 @param location string location of the data. Meaning is implementation dependent.
035 @return an iterator over input sequences
036 @throws IOException if there is a problem reading the data
037 */
038 Iterator<? extends InputSequence<?>> readInputData(String location) throws IOException;
039
040 /** Writes training data to the specified location. Training data includes input data and
041 hidden sequences. The interpretation of
042 the location string is dependent on the particular <code>InputHandler</code> implementation used.
043 @param location string location where the data should be written. Meaning is implementation dependent.
044 @param data a list of training sequences to write out.
045 @throws IOException if there is a problem writing the data
046 */
047 void writeTrainingData(String location, List<? extends TrainingSequence<?>> data) throws IOException;
048
049 /** Writes input data to the specified location. The interpretation of
050 the location string is dependent on the particular <code>InputHandler</code> implementation used.
051 @param location string location where the data should be written. Meaning is implementation dependent.
052 @param data an iterator over the input sequences to write out
053 @throws IOException if there is a problem writing the data
054 */
055 void writeInputData(String location, Iterator<? extends InputSequence<?>> data) throws IOException;
056 }