001    package calhoun.analysis.crf.io;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileWriter;
005    import java.io.IOException;
006    import java.io.Serializable;
007    import java.io.Writer;
008    import java.util.ArrayList;
009    import java.util.Iterator;
010    import java.util.List;
011    import java.util.Set;
012    
013    import org.apache.commons.logging.Log;
014    import org.apache.commons.logging.LogFactory;
015    
016    import calhoun.analysis.crf.ModelManager;
017    import calhoun.analysis.crf.io.IntervalInputSequence.IntervalRangeMapValue;
018    import calhoun.analysis.crf.statistics.PredictedActualBinaryContingencyTable;
019    import calhoun.util.Assert;
020    import calhoun.util.DenseBooleanMatrix2D;
021    import calhoun.util.FileUtil;
022    import calhoun.util.RangeMap;
023    
024    /** A legacy output handler that computes basic stats, gene calling stats, and then writes out a GTF file.
025     */
026    public class OutputHandlerGeneCallStats implements OutputHandler {
027            private static final long serialVersionUID = -1506791895658464225L; // explicit serialization version id of this handler
028            private static final Log log = LogFactory.getLog(OutputHandlerGeneCallStats.class); // class-wide logger
029    
030            private ModelManager manager; // model whose states and legal transitions key the statistics tables; assigned in setManager
031            private InputHandler inputHandler; // used by outputComplete to write training data when writeTrainingData is set
032            String location; // output path prefix; outputComplete appends ".gtf" and ".dat" and writes nothing while this is null
033            boolean writeTrainingData = false; // when true, outputComplete also writes the labeled sequences through inputHandler
034    
035            /** default constructor.  <code>ModelManager</code> and <code>InputHandler</code> must be configured separately. */
036            public OutputHandlerGeneCallStats() {
037            }
038            
039            /** creates an output handler using this model and input handler
040             * @param manager the model used for gene calling.  Used for calculating stats.
041             * @param inputHandler input handler which will be used for writing out the input sequence with the results.
042             */
043            public OutputHandlerGeneCallStats(ModelManager manager, InputHandler inputHandler) {
044                    this.inputHandler = inputHandler;
045                    setManager(manager);
046            }
047            
048            public void setOutputLocation(String location) {
049                    this.location = location;
050            }
051            
052            public void writeOutput(InputSequence<?> sequence, int[] hiddenStates) throws IOException {
053                    throw new UnsupportedOperationException();
054            }
055    
056            public void writeTestOutput(InputSequence<?> sequence, int[] truePath, int[] hiddenStates) throws IOException {
057                    calcResultIncrement(new TrainingSequence(sequence, truePath), hiddenStates);
058            }
059    
060            public void outputComplete() throws IOException {
061                    if(location != null) {
062                            if(writeTrainingData) {
063                                    try {
064                                            inputHandler.writeTrainingData(location, labeled);
065                                    }
066                                    catch(Exception ex) {
067                                            log.warn("Unable to write training data", ex);
068                                    }
069                            }
070                            writeGTF(labeled, location + ".gtf");
071                            System.out.print(this);
072                            writeResults(location + ".dat");
073                    }
074            }
075    
076            /** retursn the input handler used to write out the input sequences
077             * @return the inputHandler which will be used to write out the input sequences
078             */
079            public InputHandler getInputHandler() {
080                    return inputHandler;
081            }
082    
083            /** sets the inputHandler used to write out the input sequences
084             * @param inputHandler the inputHandler used to write out the input sequences
085             */
086            public void setInputHandler(InputHandler inputHandler) {
087                    this.inputHandler = inputHandler;
088            }
089    
090            /** gets the model used to generate results
091             * @return the model used to generate results
092             */
093            public ModelManager getManager() {
094                    return manager;
095            }
096    
097            /** sets the model used to generate results
098             * @param manager the model used to generate results
099             */
100            public void setManager(ModelManager manager) {
101                    this.manager = manager;
102    
103                    ctCodingNucleotide = new PredictedActualBinaryContingencyTable();
104                            
105                    ctExons = new PredictedActualBinaryContingencyTable();
106                    ctExons.forgetTN();
107                    
108                    nStates = manager.getNumStates();
109                    ctStates = new ArrayList<PredictedActualBinaryContingencyTable>();
110                    for (int i=0; i<nStates; i++) {
111                            ctStates.add(new PredictedActualBinaryContingencyTable());
112                    }
113    
114                    DenseBooleanMatrix2D LT = manager.getLegalTransitions();
115                    fromInd = new ArrayList<Integer>();
116                    toInd   = new ArrayList<Integer>();
117                    for (int from=0; from<nStates; from++) {
118                            for (int to=0; to<nStates; to++ ) {
119                                    if (LT.getQuick(from,to)) {
120                                            fromInd.add(from);
121                                            toInd.add(to);
122                                    }
123                            }
124                    }
125                    nTransitions = fromInd.size();
126                    ctTransitions = new ArrayList<PredictedActualBinaryContingencyTable>();
127                    for (int i=0; i<nTransitions; i++) {
128                            ctTransitions.add(new PredictedActualBinaryContingencyTable());
129                    }
130            }
131                            
132            private List<TrainingSequence<?>> labeled = new ArrayList<TrainingSequence<?>>();
133            private int correct = 0;
134            private int incorrect = 0;
135            private int perfect = 0;
136            private int imperfect = 0;
137            private transient double[] viterbiScores;
138            private double lla = 0, llv = 0;
139    
140            // Info we'll need to know about the model in order to be clever about gathering stats
141            // (the manager must be provided, other things derived from it)
142            private int nStates;
143            private int nTransitions;
144            private List<Integer> fromInd;
145            private List<Integer> toInd;;
146            
147            // The 2x2 contingency tables for which we'll keep track of results:
148            private PredictedActualBinaryContingencyTable ctCodingNucleotide;
149            private PredictedActualBinaryContingencyTable ctExons;  
150            private List<PredictedActualBinaryContingencyTable> ctStates;
151            private List<PredictedActualBinaryContingencyTable> ctTransitions;
152            
153            /** returns the exact nucleotide accuracy of the result */
154            public float getAccuracy() {
155                    return correct / (float)(correct+incorrect);
156            }
157    
158            public static class Results implements Serializable { // serializable snapshot of the collected statistics; populated and written by writeResults
159                    private static final long serialVersionUID = 9082449588200635355L;
160                    public PredictedActualBinaryContingencyTable ctCodingNucleotide; // coding-nucleotide contingency table
161                    public PredictedActualBinaryContingencyTable ctExons;   // exon-level contingency table
162                    public List<PredictedActualBinaryContingencyTable> ctStates; // per-state contingency tables
163                    public List<PredictedActualBinaryContingencyTable> ctTransitions; // per-transition contingency tables
164                    public int correct; // positions where prediction and truth agree
165                    public int incorrect; // positions where prediction and truth differ
166                    public int perfect; // sequences predicted without any mismatch
167                    public int imperfect; // sequences with at least one mismatch
168            }
169            
170            void writeResults(String loc) throws IOException {
171                    Results results = new Results();
172                    results.ctCodingNucleotide = ctCodingNucleotide;
173                    results.ctExons = ctExons;
174                    results.ctStates = ctStates;
175                    results.ctTransitions = ctTransitions;
176                    results.correct = correct;
177                    results.incorrect = incorrect;
178                    results.perfect = perfect;
179                    results.imperfect = imperfect;
180                    FileUtil.writeObject(loc, results);
181            }
182            
183            @Override
184            public String toString() {
185                    String ret = "NOTE: If you're using the CRF for prediction and pass in a dummy (e.g. all zeros) hidden\n";
186                    ret += "  sequence, then many of the following statistics will not be meaningful\n";
187                    
188                    for (int s=0; s<nStates; s++) {
189                            ret += "[State=" + manager.getStateName(s) + "] ";
190                            ctStates.get(s).freeze();
191                            ret += ctStates.get(s).summarize();
192                            ret += "\n";
193                    }
194                    
195                    for (int t=0; t<nTransitions; t++) {
196                            ret += "[Transition " + manager.getStateName(fromInd.get(t)) + " --> " + manager.getStateName(toInd.get(t)) + " ] ";
197                            ctTransitions.get(t).freeze();
198                            ret += ctTransitions.get(t).summarize();
199                            ret += "\n";
200                    }               
201    
202                    ctCodingNucleotide.freeze();
203                    ret += "[Coding nucleotides] " + ctCodingNucleotide.summarize() + "\n";
204                    
205                    ctExons.freeze();
206                    ret += "[Coding exons] " + ctExons.summarize() + "\n";          
207                    
208                    if (lla>0) {
209                            ret += "LLA:" + lla + "  LLV:" + llv + "  " + "\n";
210                    }
211                    
212                    ret += String.format("Perfectly predicted hidden sequences: %d/%d %.2f %%",perfect,perfect+imperfect,perfect*100.0/(float) (perfect+imperfect))+ "\n";
213                    
214                    ret += String.format("Nucleotide Hidden State Agreement: %d/%d %.2f %%",correct, correct + incorrect, correct * 100.0 / (float) (correct + incorrect)) + "\n";
215    
216                    
217                    return ret;
218            }
219    
220            /** calculates statstics and output for results on a given test sequence */
221            public void calcResultIncrement(TrainingSequence training, int[] predictedHiddenSequence) {
222                    labeled.add(new TrainingSequence(training.getInputSequence(), predictedHiddenSequence));  // This is only place that labelled gets added to???
223                    // So I guess the results just get built up incrementally, both the actuall hidden sequences and the stats?
224                    Assert.a(training.length() == predictedHiddenSequence.length);
225                    int[] actualHiddenSequence = new int[training.length()];
226                    for (int i=0; i<training.length(); i++) {
227                            actualHiddenSequence[i] = training.getY(i);
228                    }
229                    boolean thisperfect = true;
230                    for (int i = 0; i < predictedHiddenSequence.length; ++i) {
231                            int predY = predictedHiddenSequence[i];
232                            int realY = actualHiddenSequence[i];
233    
234                            if (realY == predY) {    correct += 1; } else { incorrect += 1; thisperfect = false; }
235                            
236                            ctCodingNucleotide.increment(isCodingPlus(predY),isCodingPlus(realY));
237                            ctCodingNucleotide.increment(isCodingMinus(predY),isCodingMinus(realY));
238                            
239                            for (int s=0; s<nStates; s++) {
240                                    ctStates.get(s).increment((predY==s),(realY==s));
241                            }       
242                    }
243                    if (thisperfect) {
244                            perfect++;
245                    } else {
246                            imperfect++;
247                    }
248                    for (int i = 1; i < predictedHiddenSequence.length; ++i) {
249                            int predY = predictedHiddenSequence[i];
250                            int realY = actualHiddenSequence[i];
251                            int predYp = predictedHiddenSequence[i-1];
252                            int realYp = actualHiddenSequence[i-1];                 
253    
254                            for (int t=0; t<nTransitions; t++) {
255                                    boolean bPred = ( (predYp==fromInd.get(t)) && (predY==toInd.get(t)) );
256                                    boolean bReal = ( (realYp==fromInd.get(t)) && (realY==toInd.get(t)) );
257                                    ctTransitions.get(t).increment(  bPred  ,  bReal  );
258                            }       
259                    }
260    
261                    // Now let's increment the contingency table for exons; note that here not counting TN's
262                    RangeMap predExonsPlus = new RangeMap();
263                    RangeMap predExonsMinus = new RangeMap();
264                    RangeMap realExonsPlus = new RangeMap();
265                    RangeMap realExonsMinus = new RangeMap();
266                    makeExonRangeMapFrom13SV(predictedHiddenSequence,predExonsPlus,predExonsMinus);
267                    makeExonRangeMapFrom13SV(actualHiddenSequence,realExonsPlus,realExonsMinus);
268                    incrementCTFromRangeMaps(ctExons,predExonsPlus,realExonsPlus);
269                    incrementCTFromRangeMaps(ctExons,predExonsMinus,realExonsMinus);
270            
271            }
272    
273            private void incrementCTFromRangeMaps(PredictedActualBinaryContingencyTable ct, RangeMap pred, RangeMap real) {
274                    // By looping through the predictions, can get at TP and FP
275                    Set<IntervalRangeMapValue> pv = pred.values();
276                    Iterator<IntervalRangeMapValue> pvi = pv.iterator();
277                    while(pvi.hasNext()) {
278                            IntervalRangeMapValue irmv = pvi.next();
279                            Set<IntervalRangeMapValue> vals = real.find(irmv.start,irmv.end);
280                            if(vals.size() == 0) {
281                                    ct.incrementFP();
282                            }
283                            else {
284                                    IntervalRangeMapValue val = vals.iterator().next();
285                                    if(val.start == irmv.start && val.end == irmv.end) {
286                                            ct.incrementTP();
287                                    } else {
288                                            ct.incrementFP();
289                                    }
290                            }
291                    }
292                    Set<IntervalRangeMapValue> rv = real.values();
293                    Iterator<IntervalRangeMapValue> rvi = rv.iterator();
294                    while(rvi.hasNext()) {
295                            IntervalRangeMapValue irmv = rvi.next();
296                            if (!pred.hasEntry(irmv.start,irmv.end)) {
297                                    ct.incrementFN();
298                            }
299                            Set<IntervalRangeMapValue> vals = pred.find(irmv.start,irmv.end);
300                            if(vals.size() == 0) {
301                                    ct.incrementFN();
302                            }
303                            else {
304                                    IntervalRangeMapValue val = vals.iterator().next();
305                                    if(val.start == irmv.start && val.end == irmv.end) {
306                                    } else {
307                                            ct.incrementFN();
308                                    }
309                            }
310                    }       
311            }
312    
313    
314            private void makeExonRangeMapFrom13SV(int[] hidden, RangeMap exonsPlus, RangeMap exonsMinus) {
315                    
316                    int len = hidden.length;
317                    
318                    for (int i=1; i<len; i++) {
319                            if ((!isCodingPlus(hidden[i-1]) && (isCodingPlus(hidden[i])))) {
320                                    int j=i;
321                                    while ((isCodingPlus(hidden[j])) &&(j<(len-1))) { j++; }
322                                    exonsPlus.add(i,j,new IntervalRangeMapValue(i,j,1.0));
323                                    //log.info("Add + "+i+" "+j);
324                            }
325                            if ((!isCodingMinus(hidden[i-1]) && (isCodingMinus(hidden[i])))) {
326                                    int j=i;
327                                    while ((isCodingMinus(hidden[j])) &&(j<len-1)) { j++; }
328                                    exonsMinus.add(i,j,new IntervalRangeMapValue(i,j,1.0));
329                                    //log.info("Add - "+i+" "+j);
330                            }
331                    }
332            }
333    
334            private boolean isCodingPlus(int y) {
335                    Assert.a( (y>=0) && (y<13) );
336                    if ( (y==1) || (y==2) || (y==3) ) { return true; }
337                    return false;
338            }
339    
340            private boolean isCodingMinus(int y) {
341                    Assert.a( (y>=0) && (y<13) );
342                    if ( (y==7) || (y==8) || (y==9) ) { return true; }
343                    return false;
344            }
345            
346            public void loglikelihoodIncrement(double logLikelihoodActual, double logLikelihoodViterbi) {
347                    lla += logLikelihoodActual;
348                    llv += logLikelihoodViterbi;
349            }
350    
351            public TrainingSequence getLabeled(int i) {
352                    return labeled.get(i);
353            }
354            
355            String seqName; // name written for the sequence currently handled by writeGTF; assigned by parseSeqName
356            String genePrefix; // gene prefix passed to writeGFTLine for the current sequence; assigned by parseSeqName
357            long   offset; // added to every coordinate that writeGTF writes for the current sequence; assigned by parseSeqName
358            
359            // This function converts a 13 state model hidden sequence to a GTF file.  
360            public void writeGTF(List<? extends TrainingSequence<?>> refStates, String filename) throws IOException
361            {               
362                    int ref, geneNum, seqCount, frame=-1;
363                    long i, exonStart, exonEnd, end;
364                    boolean inPlusExon, inMinusExon, firstExon, startCodonSplit;
365                    String strand;
366                    Writer fout = new BufferedWriter(new FileWriter(filename));     
367                    exonStart = exonEnd = 0;
368                    geneNum = 1;
369                    seqCount = 0;
370                    
371                    // Determine if model is tricycle13 or interval13.
372                    boolean interval13 = false;
373                    int prevState, state;
374                    for (TrainingSequence seq : refStates) {
375                            if (seq.length() == 0)   continue;
376                            
377                            prevState = seq.getY(0);
378                            for (i=1; i<seq.length(); i++) {
379                                    state = seq.getY((int)i);
380                                    if (prevState == 0 && (state==2 || state==3 || state==7 || state==8)) {
381                                            interval13 = true;
382                                            break;
383                                    }
384                                    prevState = state;
385                            }
386                            if (interval13)
387                                    break;
388                    }
389                    
390                    for (TrainingSequence seq : refStates) {        
391                            
392                            if (interval13) {
393                                    SequenceConverter.convertSeqFromInterval13ToTricycle13(seq);
394                            }
395                            
396                            inPlusExon  = false;
397                            inMinusExon = false;
398                            firstExon   = true;
399                            startCodonSplit = false;
400                            
401                            parseSeqName(seq, seqCount);
402                                                                            
403                            for (i=0; i<seq.length(); i++)
404                            {                       
405                                    ref = seq.getY((int)i);
406                                    
407                                    if (ref == 1 || ref == 2 || ref == 3)           // in a plus exon
408                                    {
409                                            if (!inPlusExon)
410                                            {
411                                                    exonStart = i+1;
412                                                    inPlusExon = true;
413                                                    frame = setFrame(ref);
414                                            }
415                                    }
416                                    else if (ref == 7 || ref == 8 || ref == 9)      // in a minus exon
417                                    {
418                                            if (!inMinusExon)
419                                            {
420                                                    exonStart = i+1;
421                                                    inMinusExon = true;
422                                                    frame = setFrame(ref);
423                                                    if (firstExon) {
424                                                            if (i < 3)
425                                                                    System.err.println("Minus strand gene start is within 3 nucleotides of sequence start.  No stop codon writen to GTF for gene starting at position " + (exonStart+offset));
426                                                            else
427                                                                    writeGFTLine(fout,seqName,"stop_codon",exonStart+offset-3,exonStart+offset-1,"-",frame,genePrefix,geneNum);                                             
428            
429                                                    }
430                                            }
431                                    }
432                                    else if ( inPlusExon  && (ref == 4  || ref == 5  || ref == 6) ) { // just ended an exon on plus strand, now in a plus intron
433                                            strand = "+";   
434                                            inPlusExon = false;
435                                            exonEnd = i;
436                                            if (firstExon) {
437                                                    if (exonEnd - exonStart + 1 < 3)     { end = exonEnd + offset; startCodonSplit = true;}
438                                                    else                                                            { end = exonStart+offset+2; }
439                                                    writeGFTLine(fout,seqName,"start_codon",exonStart+offset,end,strand,frame,genePrefix,geneNum);                                          
440                                                    firstExon = false;
441                                            }
442                                            else if (startCodonSplit) {     // at second exon that contains part of start codon
443                                                    Assert.a(frame==1 || frame==2);
444                                                    writeGFTLine(fout,seqName,"start_codon",exonStart+offset,exonStart+offset+frame-1,strand,frame,genePrefix,geneNum);                                             
445                                                    startCodonSplit = false;
446                                            }
447                                            writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,strand,frame,genePrefix,geneNum);
448                                    }
449                                    else if (inMinusExon && (ref == 10 || ref == 11 || ref == 12))  { // just ended an exon on minus strand, now in a minus intron
450                                            strand = "-";
451                                            inMinusExon = false;
452                                            firstExon = false;
453                                            exonEnd = i;
454                                            writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,strand,frame,genePrefix,geneNum);
455                                    }
456                                    else                                                            // now in intergenic region
457                                    {
458                                            boolean write = true;
459                                            if (inPlusExon)                                 // was in gene at previous nucleotide
460                                            {
461                                                    strand = "+";
462                                                    exonEnd = i;                    
463                                                    if (firstExon) {
464                                                            if (exonEnd - exonStart + 1 < 3) {
465                                                                    System.err.println("Single '" + strand + "' strand exon is < 3 bases for sequence '" + seqName + "'.  exonStart=" + exonStart + "  exonEnd=" + exonEnd);
466                                                                    write = false;
467                                                            }
468                                                            else {
469                                                                    writeGFTLine(fout,seqName,"start_codon",exonStart+offset,exonStart+offset+2,strand,frame,genePrefix,geneNum);
470                                                            }
471                                                    }
472                                                    else if (startCodonSplit) {     // at second exon that contains part of start codon
473                                                            Assert.a(frame==1 || frame==2);
474                                                            writeGFTLine(fout,seqName,"start_codon",exonStart+offset,exonStart+offset+frame-1,strand,frame,genePrefix,geneNum);                                             
475                                                    }
476                                                    if (write) {
477                                                            writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,  strand,frame,genePrefix,geneNum);
478                                                            writeGFTLine(fout,seqName,"stop_codon",exonEnd+offset+1,exonEnd+offset+3,strand,0,    genePrefix,geneNum);
479                                                    }
480                                                    inPlusExon  = false;
481                                                    firstExon   = true;
482                                                    startCodonSplit = false;
483                                                    geneNum++;
484                                            }
485                                            else if (inMinusExon) {
486                                                    strand = "-";
487                                                    long prevExonEnd = exonEnd;
488                                                    exonEnd = i;                    
489                                                    if (firstExon && exonEnd - exonStart + 1 < 3) {
490                                                            System.err.println("Single '" + strand + "' strand exon is < 3 bases for sequence '" + seqName + "'.  exonStart=" + exonStart + "  exonEnd=" + exonEnd);
491                                                    }
492                                                    else if (exonEnd - exonStart + 1 < 3) {      // this exon is < 3 bases, need to split start codon
493                                                            if (exonEnd - exonStart + 1 == 2) { // this exon is 2 bases
494                                                                    writeGFTLine(fout,seqName,"start_codon",prevExonEnd+offset,prevExonEnd+offset,strand,0,genePrefix,geneNum);                             
495                                                                    writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,strand,frame,genePrefix,geneNum);
496                                                                    writeGFTLine(fout,seqName,"start_codon",exonStart+offset,exonEnd+offset,strand,2,genePrefix,geneNum);                           
497                                                            }
498                                                            else if (exonEnd - exonStart + 1 == 1)  { // this exon is 1 base
499                                                                    writeGFTLine(fout,seqName,"start_codon",prevExonEnd+offset-1,prevExonEnd+offset,strand,0,genePrefix,geneNum);                           
500                                                                    writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,strand,frame,genePrefix,geneNum);
501                                                                    writeGFTLine(fout,seqName,"start_codon",exonStart+offset,exonEnd+offset,strand,1,genePrefix,geneNum);                                                                                           
502                                                            }
503                                                    }
504                                                    else {
505                                                            writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,strand,frame,genePrefix,geneNum);
506                                                            writeGFTLine(fout,seqName,"start_codon",exonEnd+offset-2,exonEnd+offset,strand,0,    genePrefix,geneNum);                               
507                                                    }                       
508                                                    inMinusExon = false;
509                                                    firstExon   = true;
510                                                    startCodonSplit = false;
511                                                    geneNum++;                      
512                                            }
513                                    }
514                            }
515                            seqCount++;
516                    }
517                    fout.close();
518            }
519            
520            private void parseSeqName(TrainingSequence seq, int seqNum) {
521                    NameInputSequence nameInput = null;
522    
523                    InputSequence<?> inputSeq = seq.getInputSequence();
524                    if(inputSeq instanceof InputSequenceComposite) {
525                            nameInput = (NameInputSequence) inputSeq.getComponent("name");
526                            
527                    }
528                    if(nameInput == null) {
529                            log.debug("Sequence name not specified.  Setting sequence name to 'SEQ_" + String.valueOf(seqNum) + "'");
530                            seqName    = "SEQ_" + String.valueOf(seqNum);   // Create a name and return.
531                            genePrefix = "SEQ_" + String.valueOf(seqNum);
532                            offset = 0;
533                            return;
534                    }
535                    String name =  nameInput.getName().trim();
536                    
537                    int colon1, colon2, numColons;
538                    
539                    if (name.startsWith("group:") || name.startsWith("seq:") ) {
540                            numColons = numOccurrences(name, ':');
541                            if (numColons == 1) {
542                                    colon1 = name.indexOf(":");
543                                    seqName = name;
544                                    genePrefix = name.substring(colon1 + 1, name.length());
545                                    offset = 0;
546                                    return;
547                            }
548                            else if (numColons == 2) {
549                                    colon1 = name.indexOf(":");
550                                    colon2 = name.lastIndexOf(":");
551                                    seqName = name.substring(0, colon2);
552                                    genePrefix = name.substring(colon1 + 1, colon2);
553                                    int pound = genePrefix.indexOf("#");
554                                    if (pound > 0) {
555                                            genePrefix = genePrefix.substring(0, pound);
556                                    }
557                                    setOffset(name.substring(colon2+1, name.length()));
558                                    return;
559                            }
560                    }
561                    log.debug("Sequence name is in unexpected format.  Setting offset=0 and sequence name='" + name + "'.");
562                    seqName    = name;
563                    genePrefix = name;
564                    offset = 0;
565            }
566            
567            // Returns the number of times the character 'c' occurs in 'str'
568            private static int numOccurrences(String str, char c) {
569                    int num = 0;
570                    int index = str.indexOf(c);
571                    while (index != -1) {
572                            num++;
573                            index = str.indexOf(c, index+1);
574                    }
575                    return num;
576            }
577    
578            private void setOffset(String str) {
579                    int numDashes, dash;
580                    numDashes = numOccurrences(str, '-');
581                    
582                    if (numDashes == 0) {
583                            offset = 0;
584                    }
585                    else if (numDashes == 1) {
586                            try {
587                                    dash = str.indexOf("-");
588                                    offset = Long.valueOf(str.substring(0, dash)) - 1;
589                            }
590                            catch (NumberFormatException e) {
591                                    System.err.println("Sequence range values in unexpected format.  Setting offset=0 for sequence='" + seqName + "'.");
592                                    offset = 0;
593                            }
594                    }
595                    else {
596                            System.err.println("Sequence range values in unexpected format.  Setting offset=0 for sequence='" + seqName + "'.");
597                            offset = 0;
598                    }
599            }
600            
        // Frame is the number of bases in this region before you get in frame.
        // That is, if frame is 0, the first three bases in this element are a codon.
        // If frame is 1, the first base is the end of a codon hanging over from the
        //     end of the previous feature and the next three are the first codon in this feature.
        // If frame is 2, the first two bases are the end of the previous codon and the
        //     next three are the first codon in this feature.
607            private static int setFrame(int ref) {
608                    int frame = -1;
609                    
610                    switch (ref) {
611                    case 1:  frame = 0;  break;
612                    case 2:  frame = 2;  break;
613                    case 3:  frame = 1;  break;
614                    case 7:  frame = 1;  break;
615                    case 8:  frame = 2;  break;
616                    case 9:  frame = 0;  break;
617                    default:  Assert.a(false, "Error setting frame, ref = ", ref);
618                    }
619                    return frame;
620            }
621    
622            // Outputs one line to the GTF file.  
623            // NOTE:  source is assumed to be 'CONRAD', and score is assumed to be unknown and set to '.'.
624            private static void writeGFTLine(Writer out, String seqName, String feature, long exonStart, long exonEnd, 
625                                                                             String strand, int frame, String genePrefix, int geneNum) throws IOException {
626    
627                    Assert.a(frame==0 || frame==1 || frame==2, "Frame value invalid, frame = ", frame);
628                    
629                    String geneId = genePrefix + "G_" + String.valueOf(geneNum);
630                    String transId = genePrefix + "T_" + String.valueOf(geneNum) + ".1";
631                    
632                    out.write(seqName + "\t" + "CONRAD" + "\t" + feature + "\t" + exonStart + "\t" + exonEnd + "\t" +
633                                      "." + "\t" + strand + "\t" + frame + "\t" + 
634                                      "gene_id \"" + geneId + "\"; transcript_id \"" + transId + "\";\n");  
635            }
636    
637            public double[] getViterbiScores() {
638                    return viterbiScores;
639            }
640    
641            /**
642             * @return Returns the writeTrainingData.
643             */
644            public boolean isWriteTrainingData() {
645                    return writeTrainingData;
646            }
647    
648            /**
649             * @param writeTrainingData The writeTrainingData to set.
650             */
651            public void setWriteTrainingData(boolean writeTrainingData) {
652                    this.writeTrainingData = writeTrainingData;
653            }
654    
655    }