001 package calhoun.analysis.crf.test;
002
003 import java.util.List;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.Conrad;
009 import calhoun.analysis.crf.io.InputHandler;
010 import calhoun.analysis.crf.io.InputHandlerInterleaved;
011 import calhoun.analysis.crf.io.OutputHandlerGeneCallStats;
012 import calhoun.analysis.crf.io.StringInput;
013 import calhoun.analysis.crf.io.TrainingSequence;
014 import calhoun.util.AbstractTestCase;
015
016 public class GTFWriterTest extends AbstractTestCase {
017 private static final Log log = LogFactory.getLog(CacheTest.class);
018 boolean debug = log.isDebugEnabled();
019
020 public void testWriteGTF() throws Exception {
021 Conrad c = new Conrad("test/input/configGTF.xml");
022
023 // Test Tricycle13 model
024 String gtfFile = "test/working/shortTrain.gtf";
025 InputHandler ih = new InputHandlerInterleaved(new StringInput());
026 List<? extends TrainingSequence<?>> data = ih.readTrainingData("test/input/shortTrain.tricycle13.txt");
027 OutputHandlerGeneCallStats oh = new OutputHandlerGeneCallStats(c.getModel(), ih);
028 oh.writeGTF(data, gtfFile);
029 assertFilesMatch("test/output/shortTrain.gtf", gtfFile);
030
031 gtfFile = "test/working/testGTF.gtf";
032 List<? extends TrainingSequence<?>> data2 = c.getInputHandler().readTrainingData("test/input/testGTF.txt");
033 oh = new OutputHandlerGeneCallStats(c.getModel(), c.getInputHandler());
034 oh.writeGTF(data2, gtfFile);
035 assertFilesMatch("test/output/testGTF.gtf", gtfFile);
036
037 // Test Interval13 model.
038 gtfFile = "test/working/shortTrainInterval13.gtf";
039 List<? extends TrainingSequence<?>> data3 = ih.readTrainingData("test/input/interval13/data/shortTrain.interval13.txt");
040 oh = new OutputHandlerGeneCallStats(c.getModel(), ih);
041 oh.writeGTF(data3, gtfFile);
042 assertFilesMatch("test/output/shortTrain.gtf", gtfFile);
043 }
044 }
045
046
047 /** Below is an explanation of the fields in a GTF file:
048 * 1. SEQNAME - The name of the sequence. Typically a chromosome or a contig.
049 * 2. SOURCE - The program that generated this feature.
050 * 3. FEATURE - The name of this type of feature. E.g. "CDS", "start_codon", "stop_codon".
051 * 4. START - The starting position of the feature in the sequence. The first base is 1.
052 * 5. END - The ending position of the feature (inclusive).
053 * 6. SCORE - A score between 0 and 1000. If there is no score value, enter ".".
054 * 7. STRAND - Valid entries include "+", "-", "." (for don't know).
055 * 8. FRAME - A number between 0-2 (inclusive) that represents the reading frame of the
056 * first base.
057 * 9. GROUPING ATTRIBUTES - Attribute keys and values.
058 *
059 * More information on frames:
060 * Frame is the number of bases in this region before you get in frame. That is, if it is 0,
061 * the first three bases in this element are a codon. If it's 1, the first base is the end
062 * of a codon hanging over from the last exon, and the next three are the first codon. If
063 * it's 2, the first two bases are the end of the previous codon, and the next three are the
064 * first codon in this feature. The first exon in each + stranded transcript has a frame of 0
065 * and the rest can vary all over the place.
066 *
067 * Note: In this test the hidden sequence is used to generate the GTF file, the nucleotide sequence
068 * is not used.
069 */
070