001    package calhoun.analysis.crf.test;
002    
003    import java.util.List;
004    
005    import org.apache.commons.logging.Log;
006    import org.apache.commons.logging.LogFactory;
007    
008    import calhoun.analysis.crf.Conrad;
009    import calhoun.analysis.crf.io.InputHandler;
010    import calhoun.analysis.crf.io.InputHandlerInterleaved;
011    import calhoun.analysis.crf.io.OutputHandlerGeneCallStats;
012    import calhoun.analysis.crf.io.StringInput;
013    import calhoun.analysis.crf.io.TrainingSequence;
014    import calhoun.util.AbstractTestCase;
015    
016    public class GTFWriterTest extends AbstractTestCase {
017            private static final Log log = LogFactory.getLog(CacheTest.class);
018            boolean debug = log.isDebugEnabled();
019            
020            public void testWriteGTF() throws Exception {
021                    Conrad c = new Conrad("test/input/configGTF.xml");
022    
023                    // Test Tricycle13 model
024                    String gtfFile = "test/working/shortTrain.gtf";
025                    InputHandler ih = new InputHandlerInterleaved(new StringInput());
026                    List<? extends TrainingSequence<?>> data = ih.readTrainingData("test/input/shortTrain.tricycle13.txt");
027                    OutputHandlerGeneCallStats oh = new OutputHandlerGeneCallStats(c.getModel(), ih);
028                    oh.writeGTF(data, gtfFile);
029                    assertFilesMatch("test/output/shortTrain.gtf", gtfFile);
030                    
031                    gtfFile = "test/working/testGTF.gtf";
032                    List<? extends TrainingSequence<?>> data2 = c.getInputHandler().readTrainingData("test/input/testGTF.txt");
033                    oh = new OutputHandlerGeneCallStats(c.getModel(), c.getInputHandler());
034                    oh.writeGTF(data2, gtfFile);
035                    assertFilesMatch("test/output/testGTF.gtf", gtfFile);
036    
037                    // Test Interval13 model.
038                    gtfFile = "test/working/shortTrainInterval13.gtf";
039                    List<? extends TrainingSequence<?>> data3 = ih.readTrainingData("test/input/interval13/data/shortTrain.interval13.txt");
040                    oh = new OutputHandlerGeneCallStats(c.getModel(), ih);
041                    oh.writeGTF(data3, gtfFile);
042                    assertFilesMatch("test/output/shortTrain.gtf", gtfFile);        
043            }
044    }
045    
046    
047    /** Below is an explanation of the fields in a GTF file:
048     *              1.  SEQNAME - The name of the sequence.  Typically a chromosome or a contig.
049     *              2.  SOURCE  - The program that generated this feature.
050     *              3.  FEATURE - The name of this type of feature.  I.e. "CDS", "start_codon", "stop_codon".
051     *              4.  START   - The starting position of the feature in the sequence.  The first base is 1.
052     *              5.  END     - The ending position of the feature (inclusive).
053     *              6.  SCORE   - A score between 0 and 1000.  If there is no score value, enter ".".
054     *              7.  STRAND  - Valid entries include "+", "-", "." (for don't know).
055     *              8.  FRAME   - A number between 0-2 (inclusive) that represents the reading frame of the
056     *                                        first base.
057     *              9.  GROUPING ATTRIBUTES - Attribute keys and values.
058     * 
059     *  More information on frames:
060     *    Frame is the number of bases in this region before you get in frame.  That is, if it is 0,
061     *    the first three bases in this element are a codon.  If it's 1, the first base is the end 
062     *    of a codon hanging over from the last exon, and the next three are the first codon.  If 
063     *    it's 2, the first two bases are the end of the previous codon, and the next three are the 
064     *    first codon in this feature.  The first exon in each + stranded transcript has a frame of 0
065     *    and the rest can vary all over the place.
066     *    
067     *  Note:  In this test the hidden sequence is used to generate the GTF file, the nucleotide sequence
068     *    is not used.
069     */
070