001    package calhoun.analysis.crf.io;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileWriter;
005    import java.io.IOException;
006    import java.io.Serializable;
007    import java.io.Writer;
008    import java.util.ArrayList;
009    import java.util.Iterator;
010    import java.util.List;
011    import java.util.Set;
012    
013    import org.apache.commons.logging.Log;
014    import org.apache.commons.logging.LogFactory;
015    
016    import calhoun.analysis.crf.ModelManager;
017    import calhoun.analysis.crf.io.IntervalInputSequence.IntervalRangeMapValue;
018    import calhoun.analysis.crf.statistics.PredictedActualBinaryContingencyTable;
019    import calhoun.util.Assert;
020    import calhoun.util.DenseBooleanMatrix2D;
021    import calhoun.util.FileUtil;
022    import calhoun.util.RangeMap;
023    
024    /** A legacy output handler that computes basic stats and gene calling stats, and then writes out a GTF file.
025     */
026    public class OutputHandlerGeneCallPredict implements OutputHandler {
027            private static final long serialVersionUID = 2014487490985409134L; // Serialization version identifier of this handler.
028    
029            private static final Log log = LogFactory.getLog(OutputHandlerGeneCallPredict.class); // Commons-logging logger for this class.
030    
031            private ModelManager manager; // Model set via setManager; queried for the state count, state names and legal transitions.
032            private InputHandler inputHandler; // Used by outputComplete to write the labeled sequences when writeTrainingData is set.
033            String location; // Output path prefix; outputComplete appends ".gtf" and ".dat" and writes nothing while this is null.
034            boolean writeTrainingData = false; // When true, outputComplete also calls inputHandler.writeTrainingData(location, labeled).
035    
/**
 * Creates an unconfigured handler.  The <code>ModelManager</code> and the
 * <code>InputHandler</code> have to be supplied afterwards through
 * {@link #setManager(ModelManager)} and {@link #setInputHandler(InputHandler)}.
 */
public OutputHandlerGeneCallPredict() {
    super();
    // Intentionally empty: the collaborators are injected through the setters.
}
039            
/**
 * Creates an output handler bound to the given model and input handler.
 *
 * @param manager the model used for gene calling; it is handed to
 *        {@link #setManager(ModelManager)}, which also allocates the statistics tables
 * @param inputHandler input handler kept for writing out the input sequences with the results
 */
public OutputHandlerGeneCallPredict(ModelManager manager, InputHandler inputHandler) {
    super();
    this.inputHandler = inputHandler;
    // Routed through the setter so the per-state and per-transition tables get created.
    this.setManager(manager);
}
048            
049            public void setOutputLocation(String location) {
050                    this.location = location;
051            }
052            
053            public void writeOutput(InputSequence<?> sequence, int[] hiddenStates) throws IOException {
054                    throw new UnsupportedOperationException();
055            }
056    
057            public void writeTestOutput(InputSequence<?> sequence, int[] truePath, int[] hiddenStates) throws IOException {
058                    calcResultIncrement(new TrainingSequence(sequence, truePath), hiddenStates);
059            }
060    
061            public void outputComplete() throws IOException {
062                    if(location != null) {
063                            if(writeTrainingData) {
064                                    try {
065                                            inputHandler.writeTrainingData(location, labeled);
066                                    }
067                                    catch(Exception ex) {
068                                            log.warn("Unable to write training data", ex);
069                                    }
070                            }
071                            writeGTF(labeled, location + ".gtf");
072                            System.out.print(this);
073                            writeResults(location + ".dat");
074                    }
075            }
076    
077            /** retursn the input handler used to write out the input sequences
078             * @return the inputHandler which will be used to write out the input sequences
079             */
080            public InputHandler getInputHandler() {
081                    return inputHandler;
082            }
083    
084            /** sets the inputHandler used to write out the input sequences
085             * @param inputHandler the inputHandler used to write out the input sequences
086             */
087            public void setInputHandler(InputHandler inputHandler) {
088                    this.inputHandler = inputHandler;
089            }
090    
091            /** gets the model used to generate results
092             * @return the model used to generate results
093             */
094            public ModelManager getManager() {
095                    return manager;
096            }
097    
098            /** sets the model used to generate results
099             * @param manager the model used to generate results
100             */
101            public void setManager(ModelManager manager) {
102                    this.manager = manager;
103    
104                    ctCodingNucleotide = new PredictedActualBinaryContingencyTable();
105                            
106                    ctExons = new PredictedActualBinaryContingencyTable();
107                    ctExons.forgetTN();
108                    
109                    nStates = manager.getNumStates();
110                    ctStates = new ArrayList<PredictedActualBinaryContingencyTable>();
111                    for (int i=0; i<nStates; i++) {
112                            ctStates.add(new PredictedActualBinaryContingencyTable());
113                    }
114    
115                    DenseBooleanMatrix2D LT = manager.getLegalTransitions();
116                    fromInd = new ArrayList<Integer>();
117                    toInd   = new ArrayList<Integer>();
118                    for (int from=0; from<nStates; from++) {
119                            for (int to=0; to<nStates; to++ ) {
120                                    if (LT.getQuick(from,to)) {
121                                            fromInd.add(from);
122                                            toInd.add(to);
123                                    }
124                            }
125                    }
126                    nTransitions = fromInd.size();
127                    ctTransitions = new ArrayList<PredictedActualBinaryContingencyTable>();
128                    for (int i=0; i<nTransitions; i++) {
129                            ctTransitions.add(new PredictedActualBinaryContingencyTable());
130                    }
131            }
132                            
133            private List<TrainingSequence<?>> labeled = new ArrayList<TrainingSequence<?>>(); // Predicted labellings, appended by calcResultIncrement and written out by outputComplete.
134            private int correct = 0; // Positions whose predicted state equals the reference state.
135            private int incorrect = 0; // Positions whose predicted state differs from the reference state.
136            private int perfect = 0; // Sequences predicted without a single mismatching position.
137            private int imperfect = 0; // Sequences with at least one mismatching position.
138            private transient double[] viterbiScores; // Not referenced anywhere in the visible part of this file.
139            private double lla = 0, llv = 0; // Running sums of the two arguments of loglikelihoodIncrement (actual, Viterbi).
140    
141            // Info we'll need to know about the model in order to be clever about gathering stats
142            // (the manager must be provided, other things derived from it)
143            private int nStates; // manager.getNumStates(), cached by setManager.
144            private int nTransitions; // Number of legal transitions found by setManager.
145            private List<Integer> fromInd; // Source state of the t-th legal transition.
146            private List<Integer> toInd;; // Target state of the t-th legal transition (parallel to fromInd).
147            
148            // The 2x2 contingency tables for which we'll keep track of results:
149            private PredictedActualBinaryContingencyTable ctCodingNucleotide; // Per position; incremented once for plus-strand and once for minus-strand coding.
150            private PredictedActualBinaryContingencyTable ctExons;  // Exon level; setManager calls forgetTN() on it.
151            private List<PredictedActualBinaryContingencyTable> ctStates; // One table per hidden state, indexed by state number.
152            private List<PredictedActualBinaryContingencyTable> ctTransitions; // One table per legal transition, indexed like fromInd/toInd.
153            
154            /** returns the exact nucleotide accuracy of the result */
155            public float getAccuracy() {
156                    return correct / (float)(correct+incorrect);
157            }
158    
159            public static class Results implements Serializable { // Serializable holder filled by writeResults and handed to FileUtil.writeObject.
160                    private static final long serialVersionUID = 9082449588200635355L; // Serialization version identifier.
161                    public PredictedActualBinaryContingencyTable ctCodingNucleotide; // Same table object as the handler's ctCodingNucleotide (reference, not a copy).
162                    public PredictedActualBinaryContingencyTable ctExons;   // Same table object as the handler's ctExons.
163                    public List<PredictedActualBinaryContingencyTable> ctStates; // Same list object as the handler's per-state tables.
164                    public List<PredictedActualBinaryContingencyTable> ctTransitions; // Same list object as the handler's per-transition tables.
165                    public int correct; // Value of the handler's correct counter when writeResults ran.
166                    public int incorrect; // Value of the handler's incorrect counter when writeResults ran.
167                    public int perfect; // Value of the handler's perfect counter when writeResults ran.
168                    public int imperfect; // Value of the handler's imperfect counter when writeResults ran.
169            }
170            
171            void writeResults(String loc) throws IOException {
172                    Results results = new Results();
173                    results.ctCodingNucleotide = ctCodingNucleotide;
174                    results.ctExons = ctExons;
175                    results.ctStates = ctStates;
176                    results.ctTransitions = ctTransitions;
177                    results.correct = correct;
178                    results.incorrect = incorrect;
179                    results.perfect = perfect;
180                    results.imperfect = imperfect;
181                    FileUtil.writeObject(loc, results);
182            }
183            
184            @Override
185            public String toString() {
186                    String ret = "";
187                    
188                    for (int s=0; s<nStates; s++) {
189                            ret += "[State=" + manager.getStateName(s) + "] ";
190                            ctStates.get(s).freeze();
191                            ret += "Predicted: " + ctStates.get(s).pp();
192                            ret += "\n";
193                    }
194                    
195                    for (int t=0; t<nTransitions; t++) {
196                            ret += "[Transition " + manager.getStateName(fromInd.get(t)) + " --> " + manager.getStateName(toInd.get(t)) + " ] ";
197                            ctTransitions.get(t).freeze();
198                            ret += "Predicted: " + ctTransitions.get(t).pp();
199                            ret += "\n";
200                    }               
201    
202                    ctCodingNucleotide.freeze();
203                    ret += "[Coding nucleotides] Predicted: " + ctCodingNucleotide.pp() + "\n";
204                    
205                    ctExons.freeze();
206                    ret += "[Coding exons] Predicted: " + ctExons.pp() + "\n";              
207                    
208                    if (lla>0) {
209                            ret += "LLA:" + lla + "  LLV:" + llv + "  " + "\n";
210                    }
211                    
212                    //ret += String.format("Perfectly predicted hidden sequences: %d/%d %.2f %%",perfect,perfect+imperfect,perfect*100.0/(float) (perfect+imperfect))+ "\n";
213                    
214                    //ret += String.format("Nucleotide Hidden State Agreement: %d/%d %.2f %%",correct, correct + incorrect, correct * 100.0 / (float) (correct + incorrect)) + "\n";
215    
216                    
217                    return ret;
218            }
219            
220            /** calculates statstics and output for results on a given test sequence */
221            public void calcResultIncrement(TrainingSequence training, int[] predictedHiddenSequence) {
222                    labeled.add(new TrainingSequence(training.getInputSequence(), predictedHiddenSequence));  // This is only place that labelled gets added to???
223                    // So I guess the results just get built up incrementally, both the actuall hidden sequences and the stats?
224                    Assert.a(training.length() == predictedHiddenSequence.length);
225                    int[] actualHiddenSequence = new int[training.length()];
226                    for (int i=0; i<training.length(); i++) {
227                            actualHiddenSequence[i] = training.getY(i);
228                    }
229                    boolean thisperfect = true;
230                    for (int i = 0; i < predictedHiddenSequence.length; ++i) {
231                            int predY = predictedHiddenSequence[i];
232                            int realY = actualHiddenSequence[i];
233    
234                            if (realY == predY) {    correct += 1; } else { incorrect += 1; thisperfect = false; }
235                            
236                            ctCodingNucleotide.increment(isCodingPlus(predY),isCodingPlus(realY));
237                            ctCodingNucleotide.increment(isCodingMinus(predY),isCodingMinus(realY));
238                            
239                            for (int s=0; s<nStates; s++) {
240                                    ctStates.get(s).increment((predY==s),(realY==s));
241                            }       
242                    }
243                    if (thisperfect) {
244                            perfect++;
245                    } else {
246                            imperfect++;
247                    }
248                    for (int i = 1; i < predictedHiddenSequence.length; ++i) {
249                            int predY = predictedHiddenSequence[i];
250                            int realY = actualHiddenSequence[i];
251                            int predYp = predictedHiddenSequence[i-1];
252                            int realYp = actualHiddenSequence[i-1];                 
253    
254                            for (int t=0; t<nTransitions; t++) {
255                                    boolean bPred = ( (predYp==fromInd.get(t)) && (predY==toInd.get(t)) );
256                                    boolean bReal = ( (realYp==fromInd.get(t)) && (realY==toInd.get(t)) );
257                                    ctTransitions.get(t).increment(  bPred  ,  bReal  );
258                            }       
259                    }
260    
261                    // Now let's increment the contingency table for exons; note that here not counting TN's
262                    RangeMap predExonsPlus = new RangeMap();
263                    RangeMap predExonsMinus = new RangeMap();
264                    RangeMap realExonsPlus = new RangeMap();
265                    RangeMap realExonsMinus = new RangeMap();
266                    makeExonRangeMapFrom13SV(predictedHiddenSequence,predExonsPlus,predExonsMinus);
267                    makeExonRangeMapFrom13SV(actualHiddenSequence,realExonsPlus,realExonsMinus);
268                    incrementCTFromRangeMaps(ctExons,predExonsPlus,realExonsPlus);
269                    incrementCTFromRangeMaps(ctExons,predExonsMinus,realExonsMinus);
270            
271            }
272    
273            private void incrementCTFromRangeMaps(PredictedActualBinaryContingencyTable ct, RangeMap pred, RangeMap real) {
274                    // By looping through the predictions, can get at TP and FP
275                    Set<IntervalRangeMapValue> pv = pred.values();
276                    Iterator<IntervalRangeMapValue> pvi = pv.iterator();
277                    while(pvi.hasNext()) {
278                            IntervalRangeMapValue irmv = pvi.next();
279                            Set<IntervalRangeMapValue> vals = real.find(irmv.start,irmv.end);
280                            if(vals.size() == 0) {
281                                    ct.incrementFP();
282                            }
283                            else {
284                                    IntervalRangeMapValue val = vals.iterator().next();
285                                    if(val.start == irmv.start && val.end == irmv.end) {
286                                            ct.incrementTP();
287                                    } else {
288                                            ct.incrementFP();
289                                    }
290                            }
291                    }
292                    Set<IntervalRangeMapValue> rv = real.values();
293                    Iterator<IntervalRangeMapValue> rvi = rv.iterator();
294                    while(rvi.hasNext()) {
295                            IntervalRangeMapValue irmv = rvi.next();
296                            if (!pred.hasEntry(irmv.start,irmv.end)) {
297                                    ct.incrementFN();
298                            }
299                            Set<IntervalRangeMapValue> vals = pred.find(irmv.start,irmv.end);
300                            if(vals.size() == 0) {
301                                    ct.incrementFN();
302                            }
303                            else {
304                                    IntervalRangeMapValue val = vals.iterator().next();
305                                    if(val.start == irmv.start && val.end == irmv.end) {
306                                    } else {
307                                            ct.incrementFN();
308                                    }
309                            }
310                    }       
311            }
312    
313    
314            private void makeExonRangeMapFrom13SV(int[] hidden, RangeMap exonsPlus, RangeMap exonsMinus) {
315                    
316                    int len = hidden.length;
317                    
318                    for (int i=1; i<len; i++) {
319                            if ((!isCodingPlus(hidden[i-1]) && (isCodingPlus(hidden[i])))) {
320                                    int j=i;
321                                    while ((isCodingPlus(hidden[j])) &&(j<(len-1))) { j++; }
322                                    exonsPlus.add(i,j,new IntervalRangeMapValue(i,j,1.0));
323                                    //log.info("Add + "+i+" "+j);
324                            }
325                            if ((!isCodingMinus(hidden[i-1]) && (isCodingMinus(hidden[i])))) {
326                                    int j=i;
327                                    while ((isCodingMinus(hidden[j])) &&(j<len-1)) { j++; }
328                                    exonsMinus.add(i,j,new IntervalRangeMapValue(i,j,1.0));
329                                    //log.info("Add - "+i+" "+j);
330                            }
331                    }
332            }
333    
334            private boolean isCodingPlus(int y) {
335                    Assert.a( (y>=0) && (y<13) );
336                    if ( (y==1) || (y==2) || (y==3) ) { return true; }
337                    return false;
338            }
339    
340            private boolean isCodingMinus(int y) {
341                    Assert.a( (y>=0) && (y<13) );
342                    if ( (y==7) || (y==8) || (y==9) ) { return true; }
343                    return false;
344            }
345            
346            public void loglikelihoodIncrement(double logLikelihoodActual, double logLikelihoodViterbi) {
347                    lla += logLikelihoodActual;
348                    llv += logLikelihoodViterbi;
349            }
350    
351            public TrainingSequence getLabeled(int i) {
352                    return labeled.get(i);
353            }
354            
355            String seqName; // Set by parseSeqName for the current sequence; passed to writeGFTLine by writeGTF.
356            String genePrefix; // Set by parseSeqName; passed to writeGFTLine together with the gene number.
357            long   offset; // Set by parseSeqName; writeGTF adds it to every coordinate it writes.
358            
359            // This function converts a 13 state model hidden sequence to a GTF file.  
360            public void writeGTF(List<? extends TrainingSequence<?>> refStates, String filename) throws IOException
361            {               
362                    int ref, geneNum, seqCount, frame=-1;
363                    long i, exonStart, exonEnd, end;
364                    boolean inPlusExon, inMinusExon, firstExon, startCodonSplit;
365                    String strand;
366                    Writer fout = new BufferedWriter(new FileWriter(filename));     
367                    exonStart = exonEnd = 0;
368                    geneNum = 1;
369                    seqCount = 0;
370                    
371                    // Determine if model is tricycle13 or interval13.
372                    boolean interval13 = false;
373                    int prevState, state;
374                    for (TrainingSequence seq : refStates) {
375                            if (seq.length() == 0)   continue;
376                            
377                            prevState = seq.getY(0);
378                            for (i=1; i<seq.length(); i++) {
379                                    state = seq.getY((int)i);
380                                    if (prevState == 0 && (state==2 || state==3 || state==7 || state==8)) {
381                                            interval13 = true;
382                                            break;
383                                    }
384                                    prevState = state;
385                            }
386                            if (interval13)
387                                    break;
388                    }
389                    
390                    for (TrainingSequence seq : refStates) {        
391                            
392                            if (interval13) {
393                                    SequenceConverter.convertSeqFromInterval13ToTricycle13(seq);
394                            }
395                            
396                            inPlusExon  = false;
397                            inMinusExon = false;
398                            firstExon   = true;
399                            startCodonSplit = false;
400                            
401                            parseSeqName(seq, seqCount);
402                                                                            
403                            for (i=0; i<seq.length(); i++)
404                            {                       
405                                    ref = seq.getY((int)i);
406                                    
407                                    if (ref == 1 || ref == 2 || ref == 3)           // in a plus exon
408                                    {
409                                            if (!inPlusExon)
410                                            {
411                                                    exonStart = i+1;
412                                                    inPlusExon = true;
413                                                    frame = setFrame(ref);
414                                            }
415                                    }
416                                    else if (ref == 7 || ref == 8 || ref == 9)      // in a minus exon
417                                    {
418                                            if (!inMinusExon)
419                                            {
420                                                    exonStart = i+1;
421                                                    inMinusExon = true;
422                                                    frame = setFrame(ref);
423                                                    if (firstExon) {
424                                                            if (i < 3)
425                                                                    System.err.println("Minus strand gene start is within 3 nucleotides of sequence start.  No stop codon writen to GTF for gene starting at position " + (exonStart+offset));
426                                                            else
427                                                                    writeGFTLine(fout,seqName,"stop_codon",exonStart+offset-3,exonStart+offset-1,"-",frame,genePrefix,geneNum);                                             
428            
429                                                    }
430                                            }
431                                    }
432                                    else if ( inPlusExon  && (ref == 4  || ref == 5  || ref == 6) ) { // just ended an exon on plus strand, now in a plus intron
433                                            strand = "+";   
434                                            inPlusExon = false;
435                                            exonEnd = i;
436                                            if (firstExon) {
437                                                    if (exonEnd - exonStart + 1 < 3)     { end = exonEnd + offset; startCodonSplit = true;}
438                                                    else                                                            { end = exonStart+offset+2; }
439                                                    writeGFTLine(fout,seqName,"start_codon",exonStart+offset,end,strand,frame,genePrefix,geneNum);                                          
440                                                    firstExon = false;
441                                            }
442                                            else if (startCodonSplit) {     // at second exon that contains part of start codon
443                                                    Assert.a(frame==1 || frame==2);
444                                                    writeGFTLine(fout,seqName,"start_codon",exonStart+offset,exonStart+offset+frame-1,strand,frame,genePrefix,geneNum);                                             
445                                                    startCodonSplit = false;
446                                            }
447                                            writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,strand,frame,genePrefix,geneNum);
448                                    }
449                                    else if (inMinusExon && (ref == 10 || ref == 11 || ref == 12))  { // just ended an exon on minus strand, now in a minus intron
450                                            strand = "-";
451                                            inMinusExon = false;
452                                            firstExon = false;
453                                            exonEnd = i;
454                                            writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,strand,frame,genePrefix,geneNum);
455                                    }
456                                    else                                                            // now in intergenic region
457                                    {
458                                            boolean write = true;
459                                            if (inPlusExon)                                 // was in gene at previous nucleotide
460                                            {
461                                                    strand = "+";
462                                                    exonEnd = i;                    
463                                                    if (firstExon) {
464                                                            if (exonEnd - exonStart + 1 < 3) {
465                                                                    System.err.println("Single '" + strand + "' strand exon is < 3 bases for sequence '" + seqName + "'.  exonStart=" + exonStart + "  exonEnd=" + exonEnd);
466                                                                    write = false;
467                                                            }
468                                                            else {
469                                                                    writeGFTLine(fout,seqName,"start_codon",exonStart+offset,exonStart+offset+2,strand,frame,genePrefix,geneNum);
470                                                            }
471                                                    }
472                                                    else if (startCodonSplit) {     // at second exon that contains part of start codon
473                                                            Assert.a(frame==1 || frame==2);
474                                                            writeGFTLine(fout,seqName,"start_codon",exonStart+offset,exonStart+offset+frame-1,strand,frame,genePrefix,geneNum);                                             
475                                                    }
476                                                    if (write) {
477                                                            writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,  strand,frame,genePrefix,geneNum);
478                                                            writeGFTLine(fout,seqName,"stop_codon",exonEnd+offset+1,exonEnd+offset+3,strand,0,    genePrefix,geneNum);
479                                                    }
480                                                    inPlusExon  = false;
481                                                    firstExon   = true;
482                                                    startCodonSplit = false;
483                                                    geneNum++;
484                                            }
485                                            else if (inMinusExon) {
486                                                    strand = "-";
487                                                    long prevExonEnd = exonEnd;
488                                                    exonEnd = i;                    
489                                                    if (firstExon && exonEnd - exonStart + 1 < 3) {
490                                                            System.err.println("Single '" + strand + "' strand exon is < 3 bases for sequence '" + seqName + "'.  exonStart=" + exonStart + "  exonEnd=" + exonEnd);
491                                                    }
492                                                    else if (exonEnd - exonStart + 1 < 3) {      // this exon is < 3 bases, need to split start codon
493                                                            if (exonEnd - exonStart + 1 == 2) { // this exon is 2 bases
494                                                                    writeGFTLine(fout,seqName,"start_codon",prevExonEnd+offset,prevExonEnd+offset,strand,0,genePrefix,geneNum);                             
495                                                                    writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,strand,frame,genePrefix,geneNum);
496                                                                    writeGFTLine(fout,seqName,"start_codon",exonStart+offset,exonEnd+offset,strand,2,genePrefix,geneNum);                           
497                                                            }
498                                                            else if (exonEnd - exonStart + 1 == 1)  { // this exon is 1 base
499                                                                    writeGFTLine(fout,seqName,"start_codon",prevExonEnd+offset-1,prevExonEnd+offset,strand,0,genePrefix,geneNum);                           
500                                                                    writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,strand,frame,genePrefix,geneNum);
501                                                                    writeGFTLine(fout,seqName,"start_codon",exonStart+offset,exonEnd+offset,strand,1,genePrefix,geneNum);                                                                                           
502                                                            }
503                                                    }
504                                                    else {
505                                                            writeGFTLine(fout,seqName,"CDS",exonStart+offset,exonEnd+offset,strand,frame,genePrefix,geneNum);
506                                                            writeGFTLine(fout,seqName,"start_codon",exonEnd+offset-2,exonEnd+offset,strand,0,    genePrefix,geneNum);                               
507                                                    }                       
508                                                    inMinusExon = false;
509                                                    firstExon   = true;
510                                                    startCodonSplit = false;
511                                                    geneNum++;                      
512                                            }
513                                    }
514                            }
515                            seqCount++;
516                    }
517                    fout.close();
518            }
519            
520            private void parseSeqName(TrainingSequence seq, int seqNum) {
521                    NameInputSequence nameInput = null;
522    
523                    InputSequence<?> inputSeq = seq.getInputSequence();
524                    if(inputSeq instanceof InputSequenceComposite) {
525                            nameInput = (NameInputSequence) inputSeq.getComponent("name");
526                            
527                    }
528                    if(nameInput == null) {
529                            log.debug("Sequence name not specified.  Setting sequence name to 'SEQ_" + String.valueOf(seqNum) + "'");
530                            seqName    = "SEQ_" + String.valueOf(seqNum);   // Create a name and return.
531                            genePrefix = "SEQ_" + String.valueOf(seqNum);
532                            offset = 0;
533                            return;
534                    }
535                    String name =  nameInput.getName().trim();
536                    
537                    int colon1, colon2, numColons;
538                    
539                    if (name.startsWith("group:") || name.startsWith("seq:") ) {
540                            numColons = numOccurrences(name, ':');
541                            if (numColons == 1) {
542                                    colon1 = name.indexOf(":");
543                                    seqName = name;
544                                    genePrefix = name.substring(colon1 + 1, name.length());
545                                    offset = 0;
546                                    return;
547                            }
548                            else if (numColons == 2) {
549                                    colon1 = name.indexOf(":");
550                                    colon2 = name.lastIndexOf(":");
551                                    seqName = name.substring(0, colon2);
552                                    genePrefix = name.substring(colon1 + 1, colon2);
553                                    int pound = genePrefix.indexOf("#");
554                                    if (pound > 0) {
555                                            genePrefix = genePrefix.substring(0, pound);
556                                    }
557                                    setOffset(name.substring(colon2+1, name.length()));
558                                    return;
559                            }
560                    }
561                    log.debug("Sequence name is in unexpected format.  Setting offset=0 and sequence name='" + name + "'.");
562                    seqName    = name;
563                    genePrefix = name;
564                    offset = 0;
565            }
566            
567            // Returns the number of times the character 'c' occurs in 'str'
568            private static int numOccurrences(String str, char c) {
569                    int num = 0;
570                    int index = str.indexOf(c);
571                    while (index != -1) {
572                            num++;
573                            index = str.indexOf(c, index+1);
574                    }
575                    return num;
576            }
577    
578            private void setOffset(String str) {
579                    int numDashes, dash;
580                    numDashes = numOccurrences(str, '-');
581                    
582                    if (numDashes == 0) {
583                            offset = 0;
584                    }
585                    else if (numDashes == 1) {
586                            try {
587                                    dash = str.indexOf("-");
588                                    offset = Long.valueOf(str.substring(0, dash)) - 1;
589                            }
590                            catch (NumberFormatException e) {
591                                    System.err.println("Sequence range values in unexpected format.  Setting offset=0 for sequence='" + seqName + "'.");
592                                    offset = 0;
593                            }
594                    }
595                    else {
596                            System.err.println("Sequence range values in unexpected format.  Setting offset=0 for sequence='" + seqName + "'.");
597                            offset = 0;
598                    }
599            }
600            
601            // Frame is the nmber of bases in this region befor you get in frame.  
602            // That is, if frame is 0, the first three bases in this element are a codon.
603            // If frame is 1, the first base is the end of a codon hanging over from the 
604            //     end of the previous codon and the next three are the first codon in this feature.
605            // If frame is 2, the first two bases are the end of the previous codon and the 
606            //     next three are the first codon in this feature.
607            private static int setFrame(int ref) {
608                    int frame = -1;
609                    
610                    switch (ref) {
611                    case 1:  frame = 0;  break;
612                    case 2:  frame = 2;  break;
613                    case 3:  frame = 1;  break;
614                    case 7:  frame = 1;  break;
615                    case 8:  frame = 2;  break;
616                    case 9:  frame = 0;  break;
617                    default:  Assert.a(false, "Error setting frame, ref = ", ref);
618                    }
619                    return frame;
620            }
621    
622            // Outputs one line to the GTF file.  
623            // NOTE:  source is assumed to be 'CONRAD', and score is assumed to be unknown and set to '.'.
624            private static void writeGFTLine(Writer out, String seqName, String feature, long exonStart, long exonEnd, 
625                                                                             String strand, int frame, String genePrefix, int geneNum) throws IOException {
626    
627                    Assert.a(frame==0 || frame==1 || frame==2, "Frame value invalid, frame = ", frame);
628                    
629                    String geneId = genePrefix + "G_" + String.valueOf(geneNum);
630                    String transId = genePrefix + "T_" + String.valueOf(geneNum) + ".1";
631                    
632                    out.write(seqName + "\t" + "CONRAD" + "\t" + feature + "\t" + exonStart + "\t" + exonEnd + "\t" +
633                                      "." + "\t" + strand + "\t" + frame + "\t" + 
634                                      "gene_id \"" + geneId + "\"; transcript_id \"" + transId + "\";\n");  
635            }
636    
637            public double[] getViterbiScores() {
638                    return viterbiScores;
639            }
640    
641            /**
642             * @return Returns the writeTrainingData.
643             */
644            public boolean isWriteTrainingData() {
645                    return writeTrainingData;
646            }
647    
648            /**
649             * @param writeTrainingData The writeTrainingData to set.
650             */
651            public void setWriteTrainingData(boolean writeTrainingData) {
652                    this.writeTrainingData = writeTrainingData;
653            }
654    
655    }