001 package calhoun.analysis.crf.test;
002
003 import java.io.File;
004 import java.util.ArrayList;
005 import java.util.List;
006
007 import org.apache.commons.logging.Log;
008 import org.apache.commons.logging.LogFactory;
009
010 import calhoun.analysis.crf.Conrad;
011 import calhoun.analysis.crf.executables.InputSequenceSubsetter;
012 import calhoun.analysis.crf.executables.PartitionRandomlyTrainTestFiles;
013 import calhoun.analysis.crf.io.TrainingSequence;
014 import calhoun.util.AbstractTestCase;
015 import calhoun.util.FileUtil;
016
017 public class CRFIOTest extends AbstractTestCase {
018 private static final Log log = LogFactory.getLog(CRFIOTest.class);
019 boolean debug = log.isDebugEnabled();
020
021
022 public void testArrayReaderInt() throws Exception {
023
024 String fileName = "test/input/test_tabbed_array_reader_int.txt";
025 String[][] s = FileUtil.readFlatFile(fileName);
026
027 assertEquals(s[0][0],"14");
028 assertEquals(s[0][1],"276470");
029 }
030
031 public void testInputSequenceReader() throws Exception {
032 String configFile = "test/input/test_subsetting_configfile.xml";
033 String inputFile = "test/input/test_subsetting_inputfile.txt";
034 String outputFile = "test/working/test_subsetting_output.txt";
035
036 // Define a model manager from file
037 Conrad c = new Conrad(configFile);
038
039 // Read a list of inputsequences from file, of the type expected by model manager cm
040 List<? extends TrainingSequence<?>> t = c.getInputHandler().readTrainingData(inputFile);
041 assertEquals(t.size(),1);
042 assertEquals(t.get(0).length(),7560);
043
044 // Take subsets of the first of the InputSequences you read
045 List<TrainingSequence<?>> s = new ArrayList<TrainingSequence<?>>();
046 s.add(t.get(0).subSequence(1,10));
047 s.add(t.get(0).subSequence(1,20));
048
049 // Write these subsetted InputSequences to file; they should be of exactly the same type as before.
050 c.getInputHandler().writeTrainingData(outputFile, s);
051
052 // NOT AUTOMATABLE: Inspect the file by hand make sure it is what you think.
053
054 // Can the file we just wrote be read in again?
055 List<? extends TrainingSequence<?>> u = c.getInputHandler().readTrainingData(outputFile);
056 assertEquals(2, u.size());
057 assertEquals(10, u.get(0).length());
058 assertEquals(20, u.get(1).length());
059 }
060
061 public void testInputSequenceFileReader() throws Exception {
062 String configFile = "test/input/inputFilesTest/test_subsetting_configfile.xml";
063 String configFile2 = "test/input/inputFilesTest/test_subsetting_configfile2.xml";
064 String inputFile = "test/input/inputFilesTest";
065 String outputFile = "test/working/test_file_subsetting_output.txt";
066 String matchFile = "test/output/test_file_subsetting_output.txt";
067
068 // Define a model manager from file
069 Conrad c = new Conrad(configFile);
070
071 // Read a list of inputsequences from file, of the type expected by model manager cm
072 List<? extends TrainingSequence<?>> t = c.getInputHandler().readTrainingData(inputFile);
073 assertEquals(2, t.size());
074 assertEquals(7560, t.get(0).length());
075
076 // Take subsets of the first of the InputSequences you read
077 List<TrainingSequence<?>> s = new ArrayList<TrainingSequence<?>>();
078 s.add(t.get(0).subSequence(1,10));
079 s.add(t.get(0).subSequence(1,20));
080 s.add(t.get(1).subSequence(1,10));
081
082 // Write these subsetted InputSequences to file; they should be of exactly the same type as before.
083 Conrad c2 = new Conrad(configFile2);
084 c2.getInputHandler().writeTrainingData(outputFile, s);
085
086 assertFilesMatch(matchFile, outputFile);
087 }
088
089
090 public void testInputSequenceSubsetterCommandLine() throws Exception {
091 String configFile = "test/input/test_subsetting_configfile.xml";
092 String inputFile = "test/input/test_subsetting_inputfile.txt";
093 String regionsFile = "test/input/test_subsetting_regions.txt";
094 String outputFile = "test/working/test_subsetting_output.txt";
095
096 // Use a command line program to take specified subset regions
097 InputSequenceSubsetter.main(new String[] {configFile,inputFile,regionsFile,"5",outputFile,"0"});
098
099 // make sure you get what you expected
100 Conrad c = new Conrad(configFile);
101 List<? extends TrainingSequence<?>> u = c.getInputHandler().readTrainingData(outputFile);
102 assertEquals(u.size(),2);
103 assertEquals(u.get(0).length(),15);
104 assertEquals(u.get(1).length(),30);
105 }
106
107 public void testInputSequenceSubsetterCommandLineSeparateFiles() throws Exception {
108 String configFile = "test/input/inputFilesTest/test_subsetting_configfile.xml";
109 String inputFile = "test/input/inputFilesTest";
110 String regionsFile = "test/input/test_subsetting_regions.txt";
111 String outputFile = "test/working/test_subsetting_output_files";
112
113 File f = new File(outputFile);
114 f.mkdirs();
115
116 // Use a command line program to take specified subset regions
117 InputSequenceSubsetter.main(new String[] {configFile,inputFile,regionsFile,"5",outputFile,"0"});
118
119 // make sure you get what you expected
120 assertFilesMatch("test/working/test_subsetting_output_files/hidden.dat", "test/output/test_subsetting_output_files/hidden.dat");
121 assertFilesMatch("test/working/test_subsetting_output_files/ref.dat", "test/output/test_subsetting_output_files/ref.dat");
122 assertFilesMatch("test/working/test_subsetting_output_files/name.dat", "test/output/test_subsetting_output_files/name.dat");
123 assertFilesMatch("test/working/test_subsetting_output_files/aln.dat", "test/output/test_subsetting_output_files/aln.dat");
124 }
125
126 public void testInputSequenceSubsetterCommandLineForcingGenic() throws Exception {
127 String configFile = "test/input/test_subsetting_configfile.xml";
128 String inputFile = "test/input/test_subsetting_inputfile.txt";
129 String regionsFile = "test/input/test_subsetting_regions.txt";
130 String outputFile = "test/working/test_subsetting_output.txt";
131
132 // Use a command line program to take specified subset regions
133 InputSequenceSubsetter.main(new String[] {configFile,inputFile,regionsFile,"5",outputFile,"1"});
134
135 // make sure you get what you expected
136 Conrad c = new Conrad(configFile);
137 List<? extends TrainingSequence<?>> u = c.getInputHandler().readTrainingData(outputFile);
138 assertEquals(u.size(),0);
139 }
140
141
142 public void testTrainTestSplitCommandLine() throws Exception {
143 String configFile = "test/input/test_subsetting_configfile.xml";
144 String inputFile = "test/input/test_traintest_split.txt";
145 String nTrainArg = "1";
146 String outputTrain = "test/working/test_subsetting_output.txt";
147 String outputTest = "test/working/test_subsetting_output.txt";
148
149 // Use a command line program to take specified subset regions
150 PartitionRandomlyTrainTestFiles.main(new String[] {configFile,inputFile,nTrainArg,outputTrain,outputTest});
151
152 // make sure you get what you expected
153 Conrad c = new Conrad(configFile);
154 List<? extends TrainingSequence<?>> u = c.getInputHandler().readTrainingData(outputTest);
155 assertEquals(u.size(),2);
156 }
157
158 }