001    package calhoun.analysis.crf.io;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileWriter;
005    import java.io.IOException;
006    import java.io.Writer;
007    import java.util.ArrayList;
008    import java.util.HashMap;
009    import java.util.List;
010    
011    import calhoun.util.Assert;
012    
/** Used for converting between different encodings of gene structure in a hidden sequence.  Mostly for legacy use.
*/
015    public class SequenceConverter {
016    
017            private static HashMap<String, Integer> map = new HashMap<String, Integer>();
018    
019            
020            public static ArrayList<ArrayList<Integer>> stateVector2StateLengths(List<? extends TrainingSequence<?>> data, int nStates) {
021    
022                    ArrayList<ArrayList<Integer>> durations = new ArrayList<ArrayList<Integer>>();
023                    
024                    for (int j=0; j<nStates; j++) {
025                            durations.add(new ArrayList<Integer>());
026                    }
027                    
028                    for (TrainingSequence<?> seq : data) {
029                            if (seq.length()==0) { continue; }
030                            int oldState = seq.getY(0);
031                            int intervalStart = 0;
032                            for (int pos=1; pos<seq.length(); pos++) {
033                                    int newState = seq.getY(pos);
034                                    if (newState != oldState) {
035                                            durations.get(oldState).add(pos-intervalStart);
036                                            intervalStart = pos;
037                                            oldState = newState;
038                                    }
039                            }
040                            durations.get(oldState).add(seq.length()-intervalStart);
041                    }
042                    
043                    return durations;
044            }
045            
046            
047            public static int[] convertSeqFromInterval13ToInterval29(int[] states) {
048                    int swap, preswap;
049                    int ctr = 0;
050                    for(int i=0; i<states.length-1; i++) {
051                            swap = 0; preswap = 0;
052                            // ig-e
053                            if (states[i]==0 && states[i+1]>=1 && states[i+1]<=3) {
054                                    preswap = 13;
055                            // e-ig
056                            } else if (states[i]>=1 && states[i]<=3 && states[i+1]==0) {
057                                    swap = 14;
058                            // e-i_I
059                            } else if (states[i]>=1 && states[i]<=3 && states[i+1]>=4 && states[i+1]<=6) {
060                                    swap = 15 + (states[i+1] - 4);
061                            // i-e_E
062                            } else if (states[i]>=4 && states[i]<=6 && states[i+1]>=1 && states[i+1]<=3) {
063                                    preswap = 18 + (states[i+1] - 1);
064                            // ig-em
065                            } else if (states[i]==0 && states[i+1]>=7 && states[i+1]<=9) {
066                                    preswap = 21;
067                            // em-ig
068                            } else if (states[i]>=7 && states[i]<=9 && states[i+1]==0) {
069                                    swap = 22;                                                              
070                            // em-i_Im
071                            } else if (states[i]>=7 && states[i]<=9 && states[i+1]>=10 && states[i+1]<=12) {
072                                    swap = 23 + (states[i+1] - 10);
073                            // im-e_Em
074                            } else if (states[i]>=10 && states[i]<=12 && states[i+1]>=7 && states[i+1]<=9) {
075                                    preswap = 26 + (states[i+1] - 7);
076                            }
077                            if (swap != 0) {
078                                    if (i != states.length-2 && states[i+2] == states[i+1]) {
079                                            states[i+1] = swap;
080                                            states[i+2] = swap;
081                                    } else if (i == states.length-2) {
082                                            states[i+1] = swap;
083                                    }
084    
085                                    ctr++;
086                            } else if (preswap != 0) {
087                                    if (i >= 1 && states[i] == states[i-1]) {
088                                            states[i] = preswap;
089                                            states[i-1] = preswap;
090                                    } else if (i == 0) {
091                                            states[i] = preswap;
092                                    }
093                            }
094                    }
095                    //log.debug("made " + ctr + " changes, where length is " + states.length);
096                    return states;
097            }
098            
099            public static int[] convertSeqFromInterval29ToInterval13(int[] seq) {
100                    int len = seq.length;
101                    Assert.a(len>=1);
102                    int prevInterval29y = -1;
103                    for (int pos = 0; pos<len; pos++) {
104                            int interval29y = seq[pos];
105                            int interval13y = -1;
106                            if (interval29y == 13 || interval29y == 14 || interval29y == 21 || interval29y == 22) {
107                                    interval13y = 0;
108                            } else if (interval29y >= 15 && interval29y <= 17) {
109                                    interval13y = (interval29y - 15) + 4;
110                            } else if (interval29y >= 23 && interval29y <= 25) {
111                                    interval13y = (interval29y - 23) + 10;
112                            } else if (interval29y >= 18 && interval29y <= 20) {
113                                    Assert.a(prevInterval29y >= 4 && prevInterval29y <= 6);
114                                    interval13y = prevInterval29y;
115                            } else if (interval29y >= 26 && interval29y <= 28) {
116                                    if (prevInterval29y == -1) {
117                                            // XXX: hack
118                                            prevInterval29y = interval29y - 26 + 10;
119                                    }
120                                    Assert.a(prevInterval29y >= 10 && prevInterval29y <= 12, "prev is " + prevInterval29y);
121                                    interval13y = prevInterval29y;
122                            } else {
123                                    interval13y = interval29y;
124                            }
125                            Assert.a(interval13y != -1);
126                            Assert.a(interval13y < 13);
127                            seq[pos] = interval13y;
128                            if (interval29y < 18 || (interval29y > 20 && interval29y < 26)) {
129                                    prevInterval29y = interval29y;
130                            }
131                    }       
132                    return seq;
133            }
134            
135            public static int[] convertSeqFromInterval29ToInterval13Wrong(int[] states) {
136                    int swap;
137                    for(int i=0; i<states.length; i++) {
138                            swap = states[i];
139                            if (states[i] == 13 || states[i] == 14 || states[i] == 21 || states[i] == 22) {
140                                    swap = 0;
141                            } else if (states[i] >= 15 && states[i] <= 17) {
142                                    swap = states[i] - 15 + 4;
143                            // Likely wrong
144                            } else if (states[i] >= 18 && states[i] <= 20) {
145                                    swap = states[i] - 18 + 4;
146                            } else if (states[i] >= 23 && states[i] <= 25) {
147                                    swap = states[i] - 23 + 10;
148                            // Also likely wrong
149                            } else if (states[i] >= 26 && states[i] <= 28) {
150                                    swap = states[i] - 26 + 10;
151                            }
152                            
153                            Assert.a(swap >= 0 && swap <= 12);
154                            states[i] = swap;
155                    }
156                    return states;
157            }
158            
159            
160            public static void convertSeqFromTricycle13ToInterval13(TrainingSequence<Character> seq) {
161                    int len = seq.length();
162                    Assert.a(len>=1);
163                    
164                    for (int pos = 0; pos<len; pos++) {
165                            int tricycle13y = seq.getY(pos);
166                            int interval13y = posTricycle2interval13(pos,tricycle13y);
167                            seq.setY(pos,interval13y);
168                    }
169            }
170    
171            public static String convertSeqFromTricycle13ToInterval13(String seq2) {
172                    char[] seq = seq2.toCharArray();
173                    int len = seq.length;
174                    Assert.a(len>=1);
175                    
176                    for (int pos = 0; pos<len; pos++) {
177                            int tricycle13y = char2integer13(seq[pos]);
178                            int interval13y = posTricycle2interval13(pos,tricycle13y);
179                            seq[pos] = integer132char(interval13y);
180                    }
181                    return new String(seq);
182            }
183            
184            private static char integer132char(int i) {
185                    if (i<10) {
186                            return (char) ('0'+i);
187                    } else if (i<36) {
188                            return (char) ('A'+(i-10));
189                    } else if (i<62) {
190                            return (char) ('a'+(i-36));
191                    }
192                    Assert.a(false);
193                    return 0;
194            }
195    
196    
197            private static int char2integer13(char x) {             
198                    int temp = x - '0';
199                    if ( (temp<0) || (temp>9)) {
200                            temp = x - 'A' + 10;                    
201                            if ( (temp<10) || (temp>35)) {
202                                    temp = x - 'a' + 36;
203                                    Assert.a( (temp>=36) && (temp<62), "Offending character was '" + x);
204                            }
205                    }
206                    Assert.a(temp<13,"temp = " + temp + "  and x = " + x);
207                    return temp;
208            }
209            
210            private static int posTricycle2interval13(int pos, int tricycle13y) {
211                    int interval13y = 0;
212                    switch(tricycle13y) {
213                    case 0:
214                            // intergenic
215                            interval13y = 0;
216                            break;
217                    case 1: 
218                    case 2:
219                    case 3:
220                            // exon plus strand
221                            interval13y = ((( pos - (tricycle13y - 1) ) %3 +3) %3) + 1;
222                            break;
223                    case 4:
224                            // intron plus strand
225                            interval13y = 6;
226                            break;
227                    case 5:
228                            interval13y = 5;
229                            break;
230                    case 6:
231                            interval13y = 4;
232                            break;
233                    case 7:
234                    case 8:
235                    case 9:
236                            // exon minus strand
237                            interval13y = (pos + (tricycle13y - 7) + 1) % 3 + 7;
238                            break;
239                    case 10:
240                            // intron minus strand
241                            interval13y = 12;
242                            break;
243                    case 11:
244                            interval13y = 11;
245                            break;
246                    case 12:
247                            interval13y = 10;
248                            break;
249                    default:
250                            Assert.a(false);
251                    }
252                    return interval13y;
253            }
254    
255    
        // NOTE: it just so happens that the conversion Interval13ToTricycle13 is its own inverse,
        // but this is just a coincidence, and it is best programming practice to write it out twice.
258            public static void convertSeqFromInterval13ToTricycle13(TrainingSequence<Character> seq) {
259                    int len = seq.length();
260                    Assert.a(len>=1);
261                    
262                    for (int pos = 0; pos<len; pos++) {
263                            int interval13y = seq.getY(pos);
264                            int tricycle13y = posInterval2tricycle13(pos,interval13y);
265                            seq.setY(pos,tricycle13y);
266                    }       
267            }       
268            
269            public static String convertSeqFromInterval13ToTricycle13(String seq2) {
270                    char[] seq = seq2.toCharArray();
271                    int len = seq.length;
272                    Assert.a(len>=1);
273                    
274                    for (int pos = 0; pos<len; pos++) {
275                            int interval13y = char2integer13(seq[pos]);
276                            int tricycle13y = posInterval2tricycle13(pos,interval13y);
277                            seq[pos] = integer132char(tricycle13y);
278                    }       
279                    return new String(seq);
280            }               
281            
282            
283            private static int posInterval2tricycle13(int pos, int interval13y) {
284                    int tricycle13y = 0;
285                    switch(interval13y) {
286                    case 0:
287                            // intergenic
288                            tricycle13y = 0;
289                            break;
290                    case 1: 
291                    case 2:
292                    case 3:
293                            // exon plus strand
294                            tricycle13y = ((( pos - (interval13y - 1) ) %3 +3) %3) + 1;
295                            break;
296                    case 4:
297                            // intron plus strand
298                            tricycle13y = 6;
299                            break;
300                    case 5:
301                            tricycle13y = 5;
302                            break;
303                    case 6:
304                            tricycle13y = 4;
305                            break;
306                    case 7:
307                    case 8:
308                    case 9:
309                            // exon minus strand
310                            tricycle13y = (((-pos + (interval13y - 7) + 2) %3 +3) %3) + 7;
311                            break;
312                    case 10:
313                            // intron minus strand
314                            tricycle13y = 12;
315                            break;
316                    case 11:
317                            tricycle13y = 11;
318                            break;
319                    case 12:
320                            tricycle13y = 10;
321                            break;
322                    default:
323                            Assert.a(false);
324                    }
325                    return tricycle13y;
326            }
327    
328    
329    
330    
331    
332    
333            // Given a hidden sequence that has 13 states and values [0, 12], converts that
334            // sequence to a 39 state model with values [0, 38].
335            public static void convertSeqFrom13To39(TrainingSequence<Character> seq)
336            {
337                    setStateMap();
338                    if (seq.length() < 2)        return;
339                    
340                    ArrayList<SeqPair> states = new ArrayList<SeqPair>();
341                    int i, state39, total, k, seqIdx, curIdx;
342                    int seqLen = seq.length();
343                    int startElement = seq.getY(0);
344                    int startIndex   = 0;
345    
346                    int prevElement = seq.getY(0);
347                    int curElement  = seq.getY(1);
348                    int nextElement;
349                    int prevState   = -1;
350                    
351                    for (i=2; i<seqLen; i++)
352                    {               
353                            curIdx = i-1;
354                            
355                            nextElement = seq.getY(i);
356                            Assert.a(curElement>=0 && curElement <=12, "invalid character in hidden sequence, '", curElement, "'");                           
357                            
358                            if (!sameState(startElement, curElement))
359                            {
360                                    // End the previous state
361                                    state39 = getState39(startElement, prevElement, curElement, prevState);
362                                    states.add(new SeqPair(state39, (curIdx-1)-startIndex + 1));
363                                    prevState = state39;
364                                    
365                                    // Start the current state
366                                    startElement = curElement;
367                                    startIndex   = curIdx;
368                            }
369                            prevElement = curElement;
370                            curElement  = nextElement;
371                    }
372    
373                    // Add last state
374                    state39 = getState39(startElement, prevElement, -1, prevState);
375                    states.add(new SeqPair(state39, (i-1)-startIndex+1));
376    
377                    // Verify sequence lengths will be the same
378                    total = 0;
379                    for (i=0; i<states.size(); i++)
380                            total += states.get(i).length;
381                    Assert.a(total == seqLen, "Sum of state lengths = " + total + ", Sequence Length = " + seqLen);
382                    
383                    // Set the values in the sequence to be in 39 state model.
384                    seqIdx = 0;
385                    for (i=0; i<states.size(); i++)
386                    {
387                            for (k=0; k<states.get(i).length; k++)
388                            {
389                                    seq.setY(seqIdx, states.get(i).state);
390                                    seqIdx++;
391                            }
392                    }
393            }
394    
395            
396            // Given a hidden sequence that has 39 states and values [0, 38], converts that
397            // sequence to a 13 state model with values [0, 12].
398            public static void convertSeqFrom39To13(TrainingSequence<Character> seq)
399            {
400                    int len = seq.length();
401                    if (len < 1) {return;}
402                    
403                    int[] y = new int[len];
404                    
405                    for (int j=0; j<len; j++) {
406                            y[j] = seq.getY(j); }
407                    
408                    convertSeqFrom39To13(y);
409                    
410                    for (int j=0; j<len; j++) {
411                            seq.setY(j,y[j]); }
412                    
413            }
414            
415            // Given a hidden sequence that has 39 states and values [0, 38], converts that
416            // sequence to a 13 state model with values [0, 12].
417            public static void convertSeqFrom39To13(int[] seq)
418            {
419                    if (seq.length < 1)  {return;}
420                    
421                    int seqLen = seq.length;
422                    int cur, i, exonPhase;
423                    boolean inExon = false;
424                    exonPhase = -1;
425                    
426                    cur = seq[0];
427                    for (i=1; i<seqLen; i++)
428                    {               
429                            cur = seq[i];
430                            Assert.a(cur>=0 && cur <=38, "invalid character in hidden sequence, '", cur, "'");                
431    
432                            if (cur==0)                                     // INTERGENIC, do nothing, 0 in both models
433                            {
434                                    inExon = false;
435                            }
436                            else if (isIntron39(cur))       // INTRON
437                            {
438                                    seq[i] = convertIntron39To13(cur);
439                                    inExon = false;
440                            }
441                            else                                                            // EXON
442                            {
443                                    if (inExon)             // already been here, just keep cycling through 1, 2, 3, or 9, 8, 7
444                                    {
445                                            seq[i] = exonPhase;
446                                            exonPhase = incrementExonPhase(exonPhase);
447                                    }
448                                    else                    // first time we're entering an exon
449                                    {
450                                            inExon = true;
451                                            exonPhase = convertExon39To13(cur);
452                                            seq[i] = exonPhase;
453                                            exonPhase = incrementExonPhase(exonPhase);
454                                    }
455                            }
456                    }
457            }       
458            
459            
460            //
461            // SUPPORTING FUNCTIONS FOR 13 -> 39 CONVERSION
462            //
463            
464            // Given a start and end state in the 13 state model, returns the state in the 39 state model.
465            private static int getState39(int start, int end, int next, int prevState)
466            {
467                    int state39 = -1;
468                    
469                    if      (start ==  0 && end ==  0)      state39 = map.get("NTG").intValue();
470                    else if (start ==  6 && end ==  6)      state39 = map.get("I0p").intValue();
471                    else if (start ==  4 && end ==  4)      state39 = map.get("I1p").intValue();
472                    else if (start ==  5 && end ==  5)      state39 = map.get("I2p").intValue();
473                    else if (start == 12 && end == 12)      state39 = map.get("I0m").intValue();
474                    else if (start == 10 && end == 10)      state39 = map.get("I1m").intValue();
475                    else if (start == 11 && end == 11)      state39 = map.get("I2m").intValue();
476    
477                    else if (start ==  1 && end ==  3)      
478                    {
479                            if (prevState == map.get("NTG").intValue() && next == 0)        state39 = map.get("ENNp").intValue();
480                            else if (prevState == map.get("NTG").intValue())                        state39 = map.get("EN0p").intValue();
481                            else if (next == 0)                                                                                     state39 = map.get("E0Np").intValue();
482                            else                                                                                                            state39 = map.get("E00p").intValue();
483                    }
484                    else if (start ==  1 && end ==  1)      
485                    {
486                            if (prevState == map.get("NTG").intValue())             state39 = map.get("EN1p").intValue();
487                            else                                                                                    state39 = map.get("E01p").intValue();
488                    }
489                    else if (start ==  1 && end ==  2)      
490                    {
491                            if (prevState == map.get("NTG").intValue())             state39 = map.get("EN2p").intValue();
492                            else                                                                                    state39 = map.get("E02p").intValue();
493                    }
494                    else if (start ==  2 && end ==  3)      
495                    {
496                            if (next == 0)                                                                  state39 = map.get("E1Np").intValue();
497                            else                                                                                    state39 = map.get("E10p").intValue();
498                    }
499                    else if (start ==  3 && end ==  3)      
500                    {
501                            if (next == 0)                                                                  state39 = map.get("E2Np").intValue();
502                            else                                                                                    state39 = map.get("E20p").intValue();
503                    }
504                    else if (start ==  2 && end ==  1)      state39 = map.get("E11p").intValue();
505                    else if (start ==  2 && end ==  2)      state39 = map.get("E12p").intValue();
506                    else if (start ==  3 && end ==  1)      state39 = map.get("E21p").intValue();
507                    else if (start ==  3 && end ==  2)      state39 = map.get("E22p").intValue();
508                    
509                    else if (start ==  9 && end ==  7)      
510                    {
511                            if (prevState == map.get("NTG").intValue() && next == 0)        state39 = map.get("ENNm").intValue();
512                            else if (prevState == map.get("NTG").intValue())                        state39 = map.get("E0Nm").intValue();
513                            else if (next == 0)                                                                                     state39 = map.get("EN0m").intValue();
514                            else                                                                                                            state39 = map.get("E00m").intValue();
515                    }
516                    else if (start ==  7 && end ==  7)      
517                    {
518                            if (next == 0)                                                                  state39 = map.get("EN1m").intValue();
519                            else                                                                                    state39 = map.get("E01m").intValue();
520                    }
521                    else if (start ==  8 && end ==  7)      
522                    {
523                            if (next == 0)                                                                  state39 = map.get("EN2m").intValue();
524                            else                                                                                    state39 = map.get("E02m").intValue();
525                    }
526                    else if (start ==  9 && end ==  8)      
527                    {
528                            if (prevState == map.get("NTG").intValue())             state39 = map.get("E1Nm").intValue();
529                            else                                                                                    state39 = map.get("E10m").intValue();
530                    }
531                    else if (start ==  9 && end ==  9)      
532                    {
533                            if (prevState == map.get("NTG").intValue())             state39 = map.get("E2Nm").intValue();
534                            else                                                                                    state39 = map.get("E20m").intValue();
535                    }
536                    else if (start ==  7 && end ==  8)      state39 = map.get("E11m").intValue();
537                    else if (start ==  8 && end ==  8)      state39 = map.get("E12m").intValue();
538                    else if (start ==  7 && end ==  9)      state39 = map.get("E21m").intValue();
539                    else if (start ==  8 && end ==  9)      state39 = map.get("E22m").intValue();
540                    
541                    if (state39 == -1)      Assert.a(false, "start = " + start + "   end = " + end);
542                    return state39;
543            }
544            
545            // Given a two states in the 13 state model, returns true if they are the same state,
546            // else returns false.  I.e. 1,2,3 are all positive exons and considered the same state.
547            private static boolean sameState(int state1, int state2)
548            {
549                    if (state1 == state2)
550                            return true;
551                    else if (isPlusExon(state1) && isPlusExon(state2))
552                            return true;
553                    else if (isMinusExon(state1) && isMinusExon(state2))
554                            return true;
555                    return false;
556            }
557            
558            // Given a state in the 13 state model, returns true if it is an exon on the plus strand.
559            private static boolean isPlusExon(int state)
560            {
561                    if (state == 1 || state == 2 || state == 3)
562                            return true;
563                    return false;
564            }
565            
566            // Given a state in the 13 state model, returns true if it is an exon on the minus strand.
567            private static boolean isMinusExon(int state)
568            {
569                    if (state == 9 || state == 8 || state == 7)
570                            return true;
571                    return false;
572            }
573            
574            //
575            // SUPPORTING FUNCTIONS FOR 39 -> 13 CONVERSION
576            //
577            
578            // Given an INTRON state in the 39 model state, returns the intron state in the 13 model state.
579            private static int convertIntron39To13(int element)
580            {       
581                    if (element == map.get("I1p").intValue())       return 4;       // intron1
582                    if (element == map.get("I2p").intValue())       return 5;       // intron2
583                    if (element == map.get("I0p").intValue())       return 6;       // intron3
584                    if (element == map.get("I1m").intValue())       return 10;      // intron1m
585                    if (element == map.get("I2m").intValue())       return 11;      // intron2m
586                    if (element == map.get("I0m").intValue())       return 12;      // intron3m
587                    
588                    Assert.a(false);
589                    return -1;
590            }
591            
        // Given an EXON state in the 39 model state, returns the exon state in the 13 model state.
593            private static int convertExon39To13(int element)
594            {
595                    int exonPhase = -1;
596                    
597                    if (getStrand39(element) == +1)         // plus strand
598                    {
599                            if      (element == map.get("ENNp").intValue())         exonPhase = 1;  // exon1
600                            else if (element == map.get("EN0p").intValue())         exonPhase = 1;  // exon1
601                            else if (element == map.get("EN1p").intValue())         exonPhase = 1;  // exon1
602                            else if (element == map.get("EN2p").intValue())         exonPhase = 1;  // exon1
603                            else if (element == map.get("E00p").intValue())         exonPhase = 1;  // exon1
604                            else if (element == map.get("E01p").intValue())         exonPhase = 1;  // exon1
605                            else if (element == map.get("E02p").intValue())         exonPhase = 1;  // exon1
606                            else if (element == map.get("E10p").intValue())         exonPhase = 2;  // exon2
607                            else if (element == map.get("E11p").intValue())         exonPhase = 2;  // exon2
608                            else if (element == map.get("E12p").intValue())         exonPhase = 2;  // exon2
609                            else if (element == map.get("E20p").intValue())         exonPhase = 3;  // exon3
610                            else if (element == map.get("E21p").intValue())         exonPhase = 3;  // exon3
611                            else if (element == map.get("E22p").intValue())         exonPhase = 3;  // exon3
612                            else if (element == map.get("E0Np").intValue())         exonPhase = 1;  // exon1
613                            else if (element == map.get("E1Np").intValue())         exonPhase = 2;  // exon2
614                            else if (element == map.get("E2Np").intValue())         exonPhase = 3;  // exon3
615                    }
616                    else if (getStrand39(element) == -1)    // minus strand
617                    {
618                            if      (element == map.get("ENNm").intValue())         exonPhase = 9;  // exon3m
619                            else if (element == map.get("EN0m").intValue())         exonPhase = 9;  // exon3m
620                            else if (element == map.get("EN1m").intValue())         exonPhase = 7;  // exon1m
621                            else if (element == map.get("EN2m").intValue())         exonPhase = 8;  // exon2m
622                            else if (element == map.get("E00m").intValue())         exonPhase = 9;  // exon3m
623                            else if (element == map.get("E01m").intValue())         exonPhase = 7;  // exon1m
624                            else if (element == map.get("E02m").intValue())         exonPhase = 8;  // exon2m
625                            else if (element == map.get("E10m").intValue())         exonPhase = 9;  // exon3m
626                            else if (element == map.get("E11m").intValue())         exonPhase = 7;  // exon1m
627                            else if (element == map.get("E12m").intValue())         exonPhase = 8;  // exon2m
628                            else if (element == map.get("E20m").intValue())         exonPhase = 9;  // exon3m
629                            else if (element == map.get("E21m").intValue())         exonPhase = 7;  // exon1m
630                            else if (element == map.get("E22m").intValue())         exonPhase = 8;  // exon2m
631                            else if (element == map.get("E0Nm").intValue())         exonPhase = 9;  // exon3m
632                            else if (element == map.get("E1Nm").intValue())         exonPhase = 9;  // exon3m
633                            else if (element == map.get("E2Nm").intValue())         exonPhase = 9;  // exon3m
634                    }
635                    if (exonPhase == -1)    Assert.a(false);
636                    return exonPhase;
637            }
638            
639            // Given an exon in the 39 state model, returns the strand (+1 or -1)
640            private static int getStrand39(int element)
641            {
642                    if (7 <= element && element <= 22)
643                            return (+1);
644                    else if (23 <= element && element <= 38)
645                            return (-1);
646                    
647                    Assert.a(false);
648                    return 0;
649            }
650            
651            // Given an element in the 39 model state, returns true if this element
652            //   is an intron, else returns false
653            private static boolean isIntron39(int element)
654            {
655                    if (1 <= element && element <= 6)
656                            return true;
657                    else
658                            return false;
659            }
660            
661            private static int incrementExonPhase(int phase)
662            {
663                    if (phase == 1 || phase == 2 || phase == 3)
664                    {
665                            phase++;
666                            if (phase == 4) phase = 1;
667                    }
668                    else if (phase == 9 || phase == 8 || phase == 7)
669                    {
670                            phase--;
671                            if (phase == 6) phase = 9;
672                    }
673                    else
674                    {
675                            Assert.a(false);
676                    }
677                    return phase;
678            }
679            
680            // 
681            // CONVERT A HIDDEN SEQUENCE TO A GFF FILE
682            // 
683            
684            // This function converts a 13 state model hidden sequence to a GFF file.  
685            // Used for debugging only.
686            public static void writeHiddenSequenceGFF(TrainingSequence<Character> refStates, String filename) throws IOException
687            {
688                    int i, ref, exonStart, exonEnd, geneNum;
689                    boolean inPlusGene = false;
690                    boolean inMinusGene = false;
691                    geneNum = 0;
692                    exonStart = exonEnd = geneNum = 0;
693                    Writer fout = new BufferedWriter(new FileWriter(filename));     
694    
695                    for (i=0; i<refStates.length(); i++)
696                    {
697                            ref = refStates.getY(i);
698                            
699                            if (ref == 1 || ref == 2 || ref == 3)
700                            {
701                                    if (!inPlusGene)
702                                    {
703                                            exonStart = i+1;
704                                            inPlusGene = true;
705                                    }
706                            }
707                            else if (ref == 7 || ref == 8 || ref == 9)
708                            {
709                                    if (!inMinusGene)
710                                    {
711                                            exonStart = i+1;
712                                            inMinusGene = true;
713                                    }
714                            }
715                            else if (inPlusGene && (ref == 4 || ref == 5 || ref == 6) )
716                            {
717                                    exonEnd = i;
718                                    fout.write("XXX\tghmm\tENNp\t" + exonStart + "\t" + exonEnd + "\t0\t+\t0\tgene_" + geneNum + "\n");
719                                    inPlusGene = false;
720                            }
721                            else if (inMinusGene && (ref == 10 || ref == 11 || ref == 12) )
722                            {
723                                    exonEnd = i;
724                                    fout.write("XXX\tghmm\tENNm\t" + exonStart + "\t" + exonEnd + "\t0\t-\t0\tgene_" + geneNum + "\n");
725                                    inMinusGene = false;
726                            }
727                            else
728                            {
729                                    if (inPlusGene || inMinusGene)  // was in gene at last nucleotide, but not in any more
730                                    {
731                                            exonEnd = i;
732                                            
733                                            if (inPlusGene)
734                                                    fout.write("XXX\tghmm\tENNp\t" + exonStart + "\t" + exonEnd + "\t0\t+\t0\tgene_" + geneNum + "\n");
735                                            else 
736                                                    fout.write("XXX\tghmm\tENNm\t" + exonStart + "\t" + exonEnd + "\t0\t-\t0\tgene_" + geneNum + "\n");
737                                            inPlusGene = false;
738                                            inMinusGene = false;
739                                            geneNum++;
740                                    }
741                            }
742                            
743                    }
744                    fout.close();
745            }
746            
747            // This function converts a 39 state model hidden sequence to a GFF file.  
748            // Used for debugging only.
749            public void writeHiddenSequence39GFF(TrainingSequence<Character> refStates, String filename) throws IOException
750            {
751                    int i, ref, exonStart, exonEnd, geneNum;
752                    boolean inPlusGene = false;
753                    boolean inMinusGene = false;
754                    geneNum = 0;
755                    exonStart = exonEnd = geneNum = 0;
756                    Writer fout = new BufferedWriter(new FileWriter(filename));     
757                    for (i=0; i<refStates.length(); i++)
758                    {
759                            ref = refStates.getY(i);
760                            
761                            if (isPlusExon39(ref))  // plus exon
762                            {
763                                    if (!inPlusGene)
764                                    {
765                                            exonStart = i+1;
766                                            inPlusGene = true;
767                                    }
768                            }
769                            else if (isMinusExon39(ref))    // minus exon
770                            {
771                                    if (!inMinusGene)
772                                    {
773                                            exonStart = i+1;
774                                            inMinusGene = true;
775                                    }
776                            }
777                            else if (inPlusGene && isPlusIntron39(ref) )    // plus intron
778                            {
779                                    exonEnd = i;
780                                    fout.write("XXX\tghmm\t" + stateIdxToString(refStates.getY(exonStart)) + "\t" + exonStart + "\t" + exonEnd + "\t0\t+\t0\tgene_" + geneNum + "\n");
781                                    inPlusGene = false;
782                            }
783                            else if (inMinusGene && isMinusIntron39(ref) )  // minus intron
784                            {
785                                    exonEnd = i;
786                                    fout.write("XXX\tghmm\t" + stateIdxToString(refStates.getY(exonStart)) + "\t" + exonStart + "\t" + exonEnd + "\t0\t-\t0\tgene_" + geneNum + "\n");
787                                    inMinusGene = false;
788                            }
789                            else
790                            {
791                                    if (inPlusGene || inMinusGene)  // was in gene at last nucleotide, but not in any more
792                                    {
793                                            exonEnd = i;
794                                            
795                                            if (inPlusGene)
796                                                    fout.write("XXX\tghmm\t" + stateIdxToString(refStates.getY(exonStart)) + "\t" + exonStart + "\t" + exonEnd + "\t0\t+\t0\tgene_" + geneNum + "\n");
797                                            else 
798                                                    fout.write("XXX\tghmm\t" + stateIdxToString(refStates.getY(exonStart)) + "\t" + exonStart + "\t" + exonEnd + "\t0\t-\t0\tgene_" + geneNum + "\n");
799                                            inPlusGene = false;
800                                            inMinusGene = false;
801                                            geneNum++;
802                                    }
803                            }
804                            
805                    }
806                    fout.close();   
807            }
808            
        // Returns true if state is a plus strand exon in the 39 state model, else returns false
810            private boolean isPlusExon39(int state)
811            {
812                    if (7 <= state && state <= 22)
813                            return true;
814                    return false;
815            }
816            private boolean isMinusExon39(int state)
817            {
818                    if (23 <= state && state <= 38)
819                            return true;
820                    return false;
821            }
822            private boolean isPlusIntron39(int state)
823            {
824                    if (1 <= state && state <= 3)
825                            return true;
826                    return false;
827            }
828            private boolean isMinusIntron39(int state)
829            {
830                    if (4 <= state && state <= 6)
831                            return true;
832                    return false;
833            }       
834            
835            // 
836            // COMMON FUNCTIONS AND STRUCTS
837            //
838            
839            // NOTE:  This info is copied from GHMM.  It likely should not be in two places.
840            private static void setStateMap()
841            {
842                    int statenum = 0;
843                    
844                    map.put("NTG", new Integer(statenum));  statenum++;
845                    map.put("I0p", new Integer(statenum));  statenum++;
846                    map.put("I1p", new Integer(statenum));  statenum++;
847                    map.put("I2p", new Integer(statenum));  statenum++;
848                    map.put("I0m", new Integer(statenum));  statenum++;
849                    map.put("I1m", new Integer(statenum));  statenum++;
850                    map.put("I2m", new Integer(statenum));  statenum++;
851                    map.put("ENNp", new Integer(statenum)); statenum++;
852                    map.put("EN0p", new Integer(statenum)); statenum++;
853                    map.put("EN1p", new Integer(statenum)); statenum++;
854                    map.put("EN2p", new Integer(statenum)); statenum++;
855                    map.put("E00p", new Integer(statenum)); statenum++;
856                    map.put("E01p", new Integer(statenum)); statenum++;
857                    map.put("E02p", new Integer(statenum)); statenum++;
858                    map.put("E10p", new Integer(statenum)); statenum++;
859                    map.put("E11p", new Integer(statenum)); statenum++;
860                    map.put("E12p", new Integer(statenum)); statenum++;
861                    map.put("E20p", new Integer(statenum)); statenum++;
862                    map.put("E21p", new Integer(statenum)); statenum++;
863                    map.put("E22p", new Integer(statenum)); statenum++;
864                    map.put("E0Np", new Integer(statenum)); statenum++;
865                    map.put("E1Np", new Integer(statenum)); statenum++;
866                    map.put("E2Np", new Integer(statenum)); statenum++;
867                    map.put("ENNm", new Integer(statenum)); statenum++;
868                    map.put("EN0m", new Integer(statenum)); statenum++;
869                    map.put("EN1m", new Integer(statenum)); statenum++;
870                    map.put("EN2m", new Integer(statenum)); statenum++;
871                    map.put("E00m", new Integer(statenum)); statenum++;
872                    map.put("E01m", new Integer(statenum)); statenum++;
873                    map.put("E02m", new Integer(statenum)); statenum++;
874                    map.put("E10m", new Integer(statenum)); statenum++;
875                    map.put("E11m", new Integer(statenum)); statenum++;
876                    map.put("E12m", new Integer(statenum)); statenum++;
877                    map.put("E20m", new Integer(statenum)); statenum++;
878                    map.put("E21m", new Integer(statenum)); statenum++;
879                    map.put("E22m", new Integer(statenum)); statenum++;
880                    map.put("E0Nm", new Integer(statenum)); statenum++;
881                    map.put("E1Nm", new Integer(statenum)); statenum++;
882                    map.put("E2Nm", new Integer(statenum)); statenum++;
883            }       
884            
885            private String stateIdxToString(int state)
886            {
887                    switch(state)
888                    {
889                    case 0: return "NTG";
890                    case 1: return "I0p";
891                    case 2: return "I1p";
892                    case 3: return "I2p";
893                    case 4: return "I0m";
894                    case 5: return "I1m";
895                    case 6: return "I2m"; 
896                    case 7: return "ENNp";
897                    case 8: return "EN0p";
898                    case 9: return "EN1p";
899                    case 10: return "EN2p";
900                    case 11: return "E00p";
901                    case 12: return "E01p";
902                    case 13: return "E02p";
903                    case 14: return "E10p";
904                    case 15: return "E11p";
905                    case 16: return "E12p";
906                    case 17: return "E20p";
907                    case 18: return "E21p";
908                    case 19: return "E22p";
909                    case 20: return "E0Np";
910                    case 21: return "E1Np";
911                    case 22: return "E2Np";
912                    case 23: return "ENNm";
913                    case 24: return "EN0m";
914                    case 25: return "EN1m";
915                    case 26: return "EN2m";
916                    case 27: return "E00m";
917                    case 28: return "E01m";
918                    case 29: return "E02m";
919                    case 30: return "E10m";
920                    case 31: return "E11m";
921                    case 32: return "E12m";
922                    case 33: return "E20m";
923                    case 34: return "E21m";
924                    case 35: return "E22m";
925                    case 36: return "E0Nm";
926                    case 37: return "E1Nm";
927                    case 38: return "E2Nm";
928                    }
929                    return "XXX";
930            }
931            
932            private static class SeqPair
933            {
934                    public int state;
935                    public int length;
936                    public SeqPair(int st, int len) {state=st; length=len; }
937            }
938    
939    
940    }