001    package calhoun.analysis.crf.io;
002    
003    import java.io.BufferedReader;
004    import java.io.BufferedWriter;
005    import java.io.File;
006    import java.io.FileReader;
007    import java.io.FileWriter;
008    import java.io.IOException;
009    import java.io.Reader;
010    import java.io.StringReader;
011    import java.util.ArrayList;
012    import java.util.Collections;
013    import java.util.HashMap;
014    import java.util.Iterator;
015    import java.util.List;
016    import java.util.Map;
017    
018    import calhoun.util.Assert;
019    
020    /** an {@link InputHandler} for handling input files that consist of multiple different sequences
021     * interleaved together in a file.  This input handler has a list of {@link InterleavedInputComponent}s.
022     * When reading in a file, this InputHandler opens a reader on the file and passes the reader to each
023     * {@link InterleavedInputComponent} in turn for each sequence.  Training data is assumed to be the first 
024     * line of each sequence, using an {@link IntInput} to encode the hidden states.
025     * <p>
026     * This input handler is useful for test data that contains multiple inputs in a file along with the training data.
027     * It is included to support backwards compatibility with the old input format.
028     * <p>
029     * This input handler can also work with "literal" input, where the location string that is passed in is not a file
030     * name, but the actual input data.  This is used frequently to pass small volumes of data in unit tests.
031     */
032    public class InputHandlerInterleaved implements InputHandler {
033            private static final long serialVersionUID = -2969140424776995686L;
034            
035            List<InterleavedInputComponent> components;
036            IntInput hiddenStateReader = new IntInput();
037            boolean locationIsLiteral = false;
038            boolean singleComponent = false;
039            String componentName;
040    
041            /** creates a new input handler, usually to be configured from an XML file
042             */
043            public InputHandlerInterleaved() {
044            }
045            
046            /** creates a new input handler, containing a single {@link InterleavedInputComponent}
047             * @param base the single input component which is contained in the input file
048             */
049            public InputHandlerInterleaved(InterleavedInputComponent base) {
050                    this.singleComponent = true;
051                    components = Collections.singletonList(base);
052            }
053            
054            /** creates a new input handler, containing a single {@link InterleavedInputComponent}
055             * @param base the single input component which is contained in the input file
056             * @param locationIsLiteral if true then the location string passed in to the read commands
057             * is the actual input data.  Otherwise, it is the location of a file from which to read the data.
058             */
059            public InputHandlerInterleaved(InterleavedInputComponent base, boolean locationIsLiteral) {
060                    this(base);
061                    this.locationIsLiteral = locationIsLiteral;
062            }
063            
064            public Iterator<? extends InputSequence<?>> readInputData(String location) throws IOException {
065                    throw new UnsupportedOperationException();
066            }
067            
068            public List<? extends TrainingSequence<?>> readTrainingData(String location) throws IOException {
069                    return readTrainingData(location, false);
070            }
071            
072            public List<? extends TrainingSequence<?>> readTrainingData(String location, boolean predict) throws IOException {
073                    // XXX: Predict semantics are unusual here
074                    Reader reader = locationIsLiteral ? new StringReader(location) : new FileReader(new File(location)); 
075                    BufferedReader r = new BufferedReader(reader);
076                    List<TrainingSequence<?>> ret = new ArrayList<TrainingSequence<?>>();
077                    try {
078                            while(r.ready()) {
079                                    int[] data = hiddenStateReader.readSequence(r);
080                                    if(data == null) {
081                                            break;
082                                    }
083                                    InputSequence<?> inputSeq = null;
084                                    
085                                    Map<String, InputSequence<?>> seq = new HashMap<String, InputSequence<?>>();
086                                    for(InterleavedInputComponent comp : components) {
087                                            boolean success = comp.read(r, seq);
088                                            Assert.a(success == true, "Not all components of a composite input sequence were present.");
089                                    }
090                                    if(singleComponent) {
091                                            Assert.a(seq.size() == 1);
092                                            Map.Entry<String, InputSequence<?>> entry = seq.entrySet().iterator().next();
093                                            componentName = entry.getKey();
094                                            inputSeq = entry.getValue();
095                                    }
096                                    else {
097                                            inputSeq = new InputSequenceComposite(seq);
098                                    }
099                                    ret.add(new TrainingSequence(inputSeq, data));
100                            }
101                    }
102                    finally {
103                            r.close();
104                    }
105                    return ret;
106            }
107    
108            public void writeInputData(String location, Iterator<? extends InputSequence<?>> data) throws IOException {
109                    throw new UnsupportedOperationException();
110            }
111    
112            public void writeTrainingData(String location, List<? extends TrainingSequence<?>> data) throws IOException {
113                    BufferedWriter w = new BufferedWriter(new FileWriter(new File(location)));
114                    try {
115                            for(TrainingSequence<?> seq : data) {
116                                    hiddenStateReader.writeSequence(w, seq.getY());
117                                    if(singleComponent) {
118                                            Map<String, InputSequence<?>> componentSeqs = new HashMap<String, InputSequence<?>>();
119                                            componentSeqs.put(componentName, seq.getInputSequence());
120                                            for(InterleavedInputComponent comp : components) {
121                                                    comp.write(w, componentSeqs);
122                                            }
123                                    }
124                                    else {
125                                            InputSequenceComposite compSeq = (InputSequenceComposite) seq.getInputSequence();
126                                            Map<String, InputSequence<?>> componentSeqs = compSeq.getComponents();
127                                            for(InterleavedInputComponent comp : components) {
128                                                    comp.write(w, componentSeqs);
129                                            }
130                                    }
131                            }
132                    }
133                    finally {
134                            w.close();
135                    }
136            }
137    
138            /** gets the current set of input components configured for this input handler.
139             * @return returns the interleaved input components that make up the file.
140             */
141            public List<InterleavedInputComponent> getComponents() {
142                    return components;
143            }
144    
145            /** sets the current set of input components configured for this input handler.
146             * @param components sets the interleaved input components that make up the file.
147             */
148            public void setComponents(List<InterleavedInputComponent> components) {
149                    this.components = components;
150            }
151    
152            /** gets the meaning of the input location string.
153             * @return true to indicate whether the input data will come in as a file
154             * or through the location string.
155             */
156            public boolean isLocationIsLiteral() {
157                    return locationIsLiteral;
158            }
159    
160            /** sets the meaning of the input location string.
161             * @param literal set locationIsLiteral to indicate whether the input data will come in as a file
162             * or through the location string.
163             */
164            public void setLocationIsLiteral(boolean literal) {
165                    this.locationIsLiteral = literal;
166            }
167            
168            /** gets the reader used to read in results for training data.
169             * @return the {@link TrainingSequenceIO} used to read in the hidden sequences for training
170             */
171            public IntInput getHiddenStateReader() {
172                    return hiddenStateReader;
173            }
174    
175            /** sets the reader used to get hidden sequences.  Must be set to read in training data.
176             * @param hiddenStateReader the reader that will be used to access hidden states
177             */
178            public void setHiddenStateReader(IntInput hiddenStateReader) {
179                    this.hiddenStateReader = hiddenStateReader;
180            }       
181    }