001    package calhoun.analysis.crf.test;
002    
003    import java.io.File;
004    import java.util.ArrayList;
005    import java.util.List;
006    
007    import org.apache.commons.logging.Log;
008    import org.apache.commons.logging.LogFactory;
009    
010    import calhoun.analysis.crf.Conrad;
011    import calhoun.analysis.crf.executables.InputSequenceSubsetter;
012    import calhoun.analysis.crf.executables.PartitionRandomlyTrainTestFiles;
013    import calhoun.analysis.crf.io.TrainingSequence;
014    import calhoun.util.AbstractTestCase;
015    import calhoun.util.FileUtil;
016    
017    public class CRFIOTest extends AbstractTestCase {
018            private static final Log log = LogFactory.getLog(CRFIOTest.class);
019            boolean debug = log.isDebugEnabled();
020            
021            
022            public void testArrayReaderInt() throws Exception {
023    
024                    String fileName = "test/input/test_tabbed_array_reader_int.txt";                
025                    String[][] s = FileUtil.readFlatFile(fileName);
026                    
027                    assertEquals(s[0][0],"14");
028                    assertEquals(s[0][1],"276470");                 
029            }
030    
031            public void testInputSequenceReader() throws Exception {
032                    String configFile = "test/input/test_subsetting_configfile.xml";
033                    String inputFile = "test/input/test_subsetting_inputfile.txt";
034                    String outputFile = "test/working/test_subsetting_output.txt";
035                    
036                    // Define a model manager from file
037                    Conrad c = new Conrad(configFile);
038                    
039                    // Read a list of inputsequences from file, of the type expected by model manager cm
040                    List<? extends TrainingSequence<?>> t = c.getInputHandler().readTrainingData(inputFile);
041                    assertEquals(t.size(),1);
042                    assertEquals(t.get(0).length(),7560);
043                    
044                    // Take subsets of the first of the InputSequences you read
045                    List<TrainingSequence<?>> s = new ArrayList<TrainingSequence<?>>();
046                    s.add(t.get(0).subSequence(1,10));
047                    s.add(t.get(0).subSequence(1,20));
048                    
049                    // Write these subsetted InputSequences to file; they should be of exactly the same type as before.
050                    c.getInputHandler().writeTrainingData(outputFile, s);
051    
052                    // NOT AUTOMATABLE: Inspect the file by hand make sure it is what you think.
053                    
054                    // Can the file we just wrote be read in again?
055                    List<? extends TrainingSequence<?>> u = c.getInputHandler().readTrainingData(outputFile);
056                    assertEquals(2, u.size());
057                    assertEquals(10, u.get(0).length());
058                    assertEquals(20, u.get(1).length());
059            }
060            
061            public void testInputSequenceFileReader() throws Exception {
062                    String configFile = "test/input/inputFilesTest/test_subsetting_configfile.xml";
063                    String configFile2 = "test/input/inputFilesTest/test_subsetting_configfile2.xml";
064                    String inputFile = "test/input/inputFilesTest";
065                    String outputFile = "test/working/test_file_subsetting_output.txt";
066                    String matchFile = "test/output/test_file_subsetting_output.txt";
067                    
068                    // Define a model manager from file
069                    Conrad c = new Conrad(configFile);
070                    
071                    // Read a list of inputsequences from file, of the type expected by model manager cm
072                    List<? extends TrainingSequence<?>> t = c.getInputHandler().readTrainingData(inputFile);
073                    assertEquals(2, t.size());
074                    assertEquals(7560, t.get(0).length());
075                    
076                    // Take subsets of the first of the InputSequences you read
077                    List<TrainingSequence<?>> s = new ArrayList<TrainingSequence<?>>();
078                    s.add(t.get(0).subSequence(1,10));
079                    s.add(t.get(0).subSequence(1,20));
080                    s.add(t.get(1).subSequence(1,10));
081                    
082                    // Write these subsetted InputSequences to file; they should be of exactly the same type as before.
083                    Conrad c2 = new Conrad(configFile2);
084                    c2.getInputHandler().writeTrainingData(outputFile, s);
085    
086                    assertFilesMatch(matchFile, outputFile);
087            }
088            
089            
090            public void testInputSequenceSubsetterCommandLine() throws Exception {
091                    String configFile = "test/input/test_subsetting_configfile.xml";
092                    String inputFile = "test/input/test_subsetting_inputfile.txt";
093                    String regionsFile = "test/input/test_subsetting_regions.txt";
094                    String outputFile = "test/working/test_subsetting_output.txt";
095                    
096                    // Use a command line program to take specified subset regions
097                    InputSequenceSubsetter.main(new String[] {configFile,inputFile,regionsFile,"5",outputFile,"0"});
098                    
099                    // make sure you get what you expected
100                    Conrad c = new Conrad(configFile);
101                    List<? extends TrainingSequence<?>> u = c.getInputHandler().readTrainingData(outputFile);
102                    assertEquals(u.size(),2);
103                    assertEquals(u.get(0).length(),15);
104                    assertEquals(u.get(1).length(),30);
105            }
106            
107            public void testInputSequenceSubsetterCommandLineSeparateFiles() throws Exception {
108                    String configFile = "test/input/inputFilesTest/test_subsetting_configfile.xml";
109                    String inputFile = "test/input/inputFilesTest";
110                    String regionsFile = "test/input/test_subsetting_regions.txt";
111                    String outputFile = "test/working/test_subsetting_output_files";
112                    
113                    File f = new File(outputFile);
114                    f.mkdirs();
115                    
116                    // Use a command line program to take specified subset regions
117                    InputSequenceSubsetter.main(new String[] {configFile,inputFile,regionsFile,"5",outputFile,"0"});
118                    
119                    // make sure you get what you expected
120                    assertFilesMatch("test/working/test_subsetting_output_files/hidden.dat", "test/output/test_subsetting_output_files/hidden.dat");
121                    assertFilesMatch("test/working/test_subsetting_output_files/ref.dat", "test/output/test_subsetting_output_files/ref.dat");
122                    assertFilesMatch("test/working/test_subsetting_output_files/name.dat", "test/output/test_subsetting_output_files/name.dat");
123                    assertFilesMatch("test/working/test_subsetting_output_files/aln.dat", "test/output/test_subsetting_output_files/aln.dat");
124            }
125            
126            public void testInputSequenceSubsetterCommandLineForcingGenic() throws Exception {
127                    String configFile = "test/input/test_subsetting_configfile.xml";
128                    String inputFile = "test/input/test_subsetting_inputfile.txt";
129                    String regionsFile = "test/input/test_subsetting_regions.txt";
130                    String outputFile = "test/working/test_subsetting_output.txt";
131                    
132                    // Use a command line program to take specified subset regions
133                    InputSequenceSubsetter.main(new String[] {configFile,inputFile,regionsFile,"5",outputFile,"1"});
134                    
135                    // make sure you get what you expected
136                    Conrad c = new Conrad(configFile);
137                    List<? extends TrainingSequence<?>> u = c.getInputHandler().readTrainingData(outputFile);
138                    assertEquals(u.size(),0);
139            }
140            
141            
142            public void testTrainTestSplitCommandLine() throws Exception {
143                    String configFile = "test/input/test_subsetting_configfile.xml";
144                    String inputFile = "test/input/test_traintest_split.txt";
145                    String nTrainArg = "1";
146                    String outputTrain = "test/working/test_subsetting_output.txt";
147                    String outputTest = "test/working/test_subsetting_output.txt";          
148                    
149                    // Use a command line program to take specified subset regions
150                    PartitionRandomlyTrainTestFiles.main(new String[] {configFile,inputFile,nTrainArg,outputTrain,outputTest});
151                    
152                    // make sure you get what you expected
153                    Conrad c = new Conrad(configFile);
154                    List<? extends TrainingSequence<?>> u = c.getInputHandler().readTrainingData(outputTest);
155                    assertEquals(u.size(),2);
156            }
157            
158    }