001 package calhoun.analysis.crf.io;
002
003 import java.io.BufferedReader;
004 import java.io.BufferedWriter;
005 import java.io.File;
006 import java.io.FileReader;
007 import java.io.FileWriter;
008 import java.io.IOException;
009 import java.io.Reader;
010 import java.io.StringReader;
011 import java.util.ArrayList;
012 import java.util.Collections;
013 import java.util.HashMap;
014 import java.util.Iterator;
015 import java.util.List;
016 import java.util.Map;
017
018 import calhoun.util.Assert;
019
020 /** an {@link InputHandler} for handling input files that consist of multiple different sequences
021 * interleaved together in a file. This input handler has a list of {@link InterleavedInputComponent}s.
022 * When reading in a file, this InputHandler opens a reader on the file and passes the reader to each
023 * {@link InterleavedInputComponent} in turn for each sequence. Training data is assumed to be the first
024 * line of each sequence, using an {@link IntInput} to encode the hidden states.
025 * <p>
026 * This input handler is useful for test data that contains multiple inputs in a file along with the training data.
027 * It is included to support backwards compatibility with the old input format.
028 * <p>
029 * This input handler can also work with "literal" input, where the location string that is passed in is not a file
030 * name, but the actual input data. This is used frequently to pass small volumes of data in unit tests.
031 */
032 public class InputHandlerInterleaved implements InputHandler {
033 private static final long serialVersionUID = -2969140424776995686L;
034
035 List<InterleavedInputComponent> components;
036 IntInput hiddenStateReader = new IntInput();
037 boolean locationIsLiteral = false;
038 boolean singleComponent = false;
039 String componentName;
040
041 /** creates a new input handler, usually to be configured from an XML file
042 */
043 public InputHandlerInterleaved() {
044 }
045
046 /** creates a new input handler, containing a single {@link InterleavedInputComponent}
047 * @param base the single input component which is contained in the input file
048 */
049 public InputHandlerInterleaved(InterleavedInputComponent base) {
050 this.singleComponent = true;
051 components = Collections.singletonList(base);
052 }
053
054 /** creates a new input handler, containing a single {@link InterleavedInputComponent}
055 * @param base the single input component which is contained in the input file
056 * @param locationIsLiteral if true then the location string passed in to the read commands
057 * is the actual input data. Otherwise, it is the location of a file from which to read the data.
058 */
059 public InputHandlerInterleaved(InterleavedInputComponent base, boolean locationIsLiteral) {
060 this(base);
061 this.locationIsLiteral = locationIsLiteral;
062 }
063
064 public Iterator<? extends InputSequence<?>> readInputData(String location) throws IOException {
065 throw new UnsupportedOperationException();
066 }
067
068 public List<? extends TrainingSequence<?>> readTrainingData(String location) throws IOException {
069 return readTrainingData(location, false);
070 }
071
072 public List<? extends TrainingSequence<?>> readTrainingData(String location, boolean predict) throws IOException {
073 // XXX: Predict semantics are unusual here
074 Reader reader = locationIsLiteral ? new StringReader(location) : new FileReader(new File(location));
075 BufferedReader r = new BufferedReader(reader);
076 List<TrainingSequence<?>> ret = new ArrayList<TrainingSequence<?>>();
077 try {
078 while(r.ready()) {
079 int[] data = hiddenStateReader.readSequence(r);
080 if(data == null) {
081 break;
082 }
083 InputSequence<?> inputSeq = null;
084
085 Map<String, InputSequence<?>> seq = new HashMap<String, InputSequence<?>>();
086 for(InterleavedInputComponent comp : components) {
087 boolean success = comp.read(r, seq);
088 Assert.a(success == true, "Not all components of a composite input sequence were present.");
089 }
090 if(singleComponent) {
091 Assert.a(seq.size() == 1);
092 Map.Entry<String, InputSequence<?>> entry = seq.entrySet().iterator().next();
093 componentName = entry.getKey();
094 inputSeq = entry.getValue();
095 }
096 else {
097 inputSeq = new InputSequenceComposite(seq);
098 }
099 ret.add(new TrainingSequence(inputSeq, data));
100 }
101 }
102 finally {
103 r.close();
104 }
105 return ret;
106 }
107
108 public void writeInputData(String location, Iterator<? extends InputSequence<?>> data) throws IOException {
109 throw new UnsupportedOperationException();
110 }
111
112 public void writeTrainingData(String location, List<? extends TrainingSequence<?>> data) throws IOException {
113 BufferedWriter w = new BufferedWriter(new FileWriter(new File(location)));
114 try {
115 for(TrainingSequence<?> seq : data) {
116 hiddenStateReader.writeSequence(w, seq.getY());
117 if(singleComponent) {
118 Map<String, InputSequence<?>> componentSeqs = new HashMap<String, InputSequence<?>>();
119 componentSeqs.put(componentName, seq.getInputSequence());
120 for(InterleavedInputComponent comp : components) {
121 comp.write(w, componentSeqs);
122 }
123 }
124 else {
125 InputSequenceComposite compSeq = (InputSequenceComposite) seq.getInputSequence();
126 Map<String, InputSequence<?>> componentSeqs = compSeq.getComponents();
127 for(InterleavedInputComponent comp : components) {
128 comp.write(w, componentSeqs);
129 }
130 }
131 }
132 }
133 finally {
134 w.close();
135 }
136 }
137
138 /** gets the current set of input components configured for this input handler.
139 * @return returns the interleaved input components that make up the file.
140 */
141 public List<InterleavedInputComponent> getComponents() {
142 return components;
143 }
144
145 /** sets the current set of input components configured for this input handler.
146 * @param components sets the interleaved input components that make up the file.
147 */
148 public void setComponents(List<InterleavedInputComponent> components) {
149 this.components = components;
150 }
151
152 /** gets the meaning of the input location string.
153 * @return true to indicate whether the input data will come in as a file
154 * or through the location string.
155 */
156 public boolean isLocationIsLiteral() {
157 return locationIsLiteral;
158 }
159
160 /** sets the meaning of the input location string.
161 * @param literal set locationIsLiteral to indicate whether the input data will come in as a file
162 * or through the location string.
163 */
164 public void setLocationIsLiteral(boolean literal) {
165 this.locationIsLiteral = literal;
166 }
167
168 /** gets the reader used to read in results for training data.
169 * @return the {@link TrainingSequenceIO} used to read in the hidden sequences for training
170 */
171 public IntInput getHiddenStateReader() {
172 return hiddenStateReader;
173 }
174
175 /** sets the reader used to get hidden sequences. Must be set to read in training data.
176 * @param hiddenStateReader the reader that will be used to access hidden states
177 */
178 public void setHiddenStateReader(IntInput hiddenStateReader) {
179 this.hiddenStateReader = hiddenStateReader;
180 }
181 }