001 package calhoun.analysis.crf.io;
002
003 import java.io.BufferedWriter;
004 import java.io.FileWriter;
005 import java.io.IOException;
006 import java.io.Writer;
007 import java.util.ArrayList;
008 import java.util.HashMap;
009 import java.util.List;
010
011 import calhoun.util.Assert;
012
/**
 * Used for converting between different encodings of gene structure in a hidden sequence. Mostly for legacy use.
 */
015 public class SequenceConverter {
016
017 private static HashMap<String, Integer> map = new HashMap<String, Integer>();
018
019
020 public static ArrayList<ArrayList<Integer>> stateVector2StateLengths(List<? extends TrainingSequence<?>> data, int nStates) {
021
022 ArrayList<ArrayList<Integer>> durations = new ArrayList<ArrayList<Integer>>();
023
024 for (int j=0; j<nStates; j++) {
025 durations.add(new ArrayList<Integer>());
026 }
027
028 for (TrainingSequence<?> seq : data) {
029 if (seq.length()==0) { continue; }
030 int oldState = seq.getY(0);
031 int intervalStart = 0;
032 for (int pos=1; pos<seq.length(); pos++) {
033 int newState = seq.getY(pos);
034 if (newState != oldState) {
035 durations.get(oldState).add(pos-intervalStart);
036 intervalStart = pos;
037 oldState = newState;
038 }
039 }
040 durations.get(oldState).add(seq.length()-intervalStart);
041 }
042
043 return durations;
044 }
045
046
047 public static int[] convertSeqFromInterval13ToInterval29(int[] states) {
048 int swap, preswap;
049 int ctr = 0;
050 for(int i=0; i<states.length-1; i++) {
051 swap = 0; preswap = 0;
052 // ig-e
053 if (states[i]==0 && states[i+1]>=1 && states[i+1]<=3) {
054 preswap = 13;
055 // e-ig
056 } else if (states[i]>=1 && states[i]<=3 && states[i+1]==0) {
057 swap = 14;
058 // e-i_I
059 } else if (states[i]>=1 && states[i]<=3 && states[i+1]>=4 && states[i+1]<=6) {
060 swap = 15 + (states[i+1] - 4);
061 // i-e_E
062 } else if (states[i]>=4 && states[i]<=6 && states[i+1]>=1 && states[i+1]<=3) {
063 preswap = 18 + (states[i+1] - 1);
064 // ig-em
065 } else if (states[i]==0 && states[i+1]>=7 && states[i+1]<=9) {
066 preswap = 21;
067 // em-ig
068 } else if (states[i]>=7 && states[i]<=9 && states[i+1]==0) {
069 swap = 22;
070 // em-i_Im
071 } else if (states[i]>=7 && states[i]<=9 && states[i+1]>=10 && states[i+1]<=12) {
072 swap = 23 + (states[i+1] - 10);
073 // im-e_Em
074 } else if (states[i]>=10 && states[i]<=12 && states[i+1]>=7 && states[i+1]<=9) {
075 preswap = 26 + (states[i+1] - 7);
076 }
077 if (swap != 0) {
078 if (i != states.length-2 && states[i+2] == states[i+1]) {
079 states[i+1] = swap;
080 states[i+2] = swap;
081 } else if (i == states.length-2) {
082 states[i+1] = swap;
083 }
084
085 ctr++;
086 } else if (preswap != 0) {
087 if (i >= 1 && states[i] == states[i-1]) {
088 states[i] = preswap;
089 states[i-1] = preswap;
090 } else if (i == 0) {
091 states[i] = preswap;
092 }
093 }
094 }
095 //log.debug("made " + ctr + " changes, where length is " + states.length);
096 return states;
097 }
098
099 public static int[] convertSeqFromInterval29ToInterval13(int[] seq) {
100 int len = seq.length;
101 Assert.a(len>=1);
102 int prevInterval29y = -1;
103 for (int pos = 0; pos<len; pos++) {
104 int interval29y = seq[pos];
105 int interval13y = -1;
106 if (interval29y == 13 || interval29y == 14 || interval29y == 21 || interval29y == 22) {
107 interval13y = 0;
108 } else if (interval29y >= 15 && interval29y <= 17) {
109 interval13y = (interval29y - 15) + 4;
110 } else if (interval29y >= 23 && interval29y <= 25) {
111 interval13y = (interval29y - 23) + 10;
112 } else if (interval29y >= 18 && interval29y <= 20) {
113 Assert.a(prevInterval29y >= 4 && prevInterval29y <= 6);
114 interval13y = prevInterval29y;
115 } else if (interval29y >= 26 && interval29y <= 28) {
116 if (prevInterval29y == -1) {
117 // XXX: hack
118 prevInterval29y = interval29y - 26 + 10;
119 }
120 Assert.a(prevInterval29y >= 10 && prevInterval29y <= 12, "prev is " + prevInterval29y);
121 interval13y = prevInterval29y;
122 } else {
123 interval13y = interval29y;
124 }
125 Assert.a(interval13y != -1);
126 Assert.a(interval13y < 13);
127 seq[pos] = interval13y;
128 if (interval29y < 18 || (interval29y > 20 && interval29y < 26)) {
129 prevInterval29y = interval29y;
130 }
131 }
132 return seq;
133 }
134
135 public static int[] convertSeqFromInterval29ToInterval13Wrong(int[] states) {
136 int swap;
137 for(int i=0; i<states.length; i++) {
138 swap = states[i];
139 if (states[i] == 13 || states[i] == 14 || states[i] == 21 || states[i] == 22) {
140 swap = 0;
141 } else if (states[i] >= 15 && states[i] <= 17) {
142 swap = states[i] - 15 + 4;
143 // Likely wrong
144 } else if (states[i] >= 18 && states[i] <= 20) {
145 swap = states[i] - 18 + 4;
146 } else if (states[i] >= 23 && states[i] <= 25) {
147 swap = states[i] - 23 + 10;
148 // Also likely wrong
149 } else if (states[i] >= 26 && states[i] <= 28) {
150 swap = states[i] - 26 + 10;
151 }
152
153 Assert.a(swap >= 0 && swap <= 12);
154 states[i] = swap;
155 }
156 return states;
157 }
158
159
160 public static void convertSeqFromTricycle13ToInterval13(TrainingSequence<Character> seq) {
161 int len = seq.length();
162 Assert.a(len>=1);
163
164 for (int pos = 0; pos<len; pos++) {
165 int tricycle13y = seq.getY(pos);
166 int interval13y = posTricycle2interval13(pos,tricycle13y);
167 seq.setY(pos,interval13y);
168 }
169 }
170
171 public static String convertSeqFromTricycle13ToInterval13(String seq2) {
172 char[] seq = seq2.toCharArray();
173 int len = seq.length;
174 Assert.a(len>=1);
175
176 for (int pos = 0; pos<len; pos++) {
177 int tricycle13y = char2integer13(seq[pos]);
178 int interval13y = posTricycle2interval13(pos,tricycle13y);
179 seq[pos] = integer132char(interval13y);
180 }
181 return new String(seq);
182 }
183
184 private static char integer132char(int i) {
185 if (i<10) {
186 return (char) ('0'+i);
187 } else if (i<36) {
188 return (char) ('A'+(i-10));
189 } else if (i<62) {
190 return (char) ('a'+(i-36));
191 }
192 Assert.a(false);
193 return 0;
194 }
195
196
197 private static int char2integer13(char x) {
198 int temp = x - '0';
199 if ( (temp<0) || (temp>9)) {
200 temp = x - 'A' + 10;
201 if ( (temp<10) || (temp>35)) {
202 temp = x - 'a' + 36;
203 Assert.a( (temp>=36) && (temp<62), "Offending character was '" + x);
204 }
205 }
206 Assert.a(temp<13,"temp = " + temp + " and x = " + x);
207 return temp;
208 }
209
210 private static int posTricycle2interval13(int pos, int tricycle13y) {
211 int interval13y = 0;
212 switch(tricycle13y) {
213 case 0:
214 // intergenic
215 interval13y = 0;
216 break;
217 case 1:
218 case 2:
219 case 3:
220 // exon plus strand
221 interval13y = ((( pos - (tricycle13y - 1) ) %3 +3) %3) + 1;
222 break;
223 case 4:
224 // intron plus strand
225 interval13y = 6;
226 break;
227 case 5:
228 interval13y = 5;
229 break;
230 case 6:
231 interval13y = 4;
232 break;
233 case 7:
234 case 8:
235 case 9:
236 // exon minus strand
237 interval13y = (pos + (tricycle13y - 7) + 1) % 3 + 7;
238 break;
239 case 10:
240 // intron minus strand
241 interval13y = 12;
242 break;
243 case 11:
244 interval13y = 11;
245 break;
246 case 12:
247 interval13y = 10;
248 break;
249 default:
250 Assert.a(false);
251 }
252 return interval13y;
253 }
254
255
// NOTE: it just so happens that the conversion Interval13ToTricycle13 is its own inverse,
// but this is just a coincidence, and it is best programming practice to write it out twice.
258 public static void convertSeqFromInterval13ToTricycle13(TrainingSequence<Character> seq) {
259 int len = seq.length();
260 Assert.a(len>=1);
261
262 for (int pos = 0; pos<len; pos++) {
263 int interval13y = seq.getY(pos);
264 int tricycle13y = posInterval2tricycle13(pos,interval13y);
265 seq.setY(pos,tricycle13y);
266 }
267 }
268
269 public static String convertSeqFromInterval13ToTricycle13(String seq2) {
270 char[] seq = seq2.toCharArray();
271 int len = seq.length;
272 Assert.a(len>=1);
273
274 for (int pos = 0; pos<len; pos++) {
275 int interval13y = char2integer13(seq[pos]);
276 int tricycle13y = posInterval2tricycle13(pos,interval13y);
277 seq[pos] = integer132char(tricycle13y);
278 }
279 return new String(seq);
280 }
281
282
283 private static int posInterval2tricycle13(int pos, int interval13y) {
284 int tricycle13y = 0;
285 switch(interval13y) {
286 case 0:
287 // intergenic
288 tricycle13y = 0;
289 break;
290 case 1:
291 case 2:
292 case 3:
293 // exon plus strand
294 tricycle13y = ((( pos - (interval13y - 1) ) %3 +3) %3) + 1;
295 break;
296 case 4:
297 // intron plus strand
298 tricycle13y = 6;
299 break;
300 case 5:
301 tricycle13y = 5;
302 break;
303 case 6:
304 tricycle13y = 4;
305 break;
306 case 7:
307 case 8:
308 case 9:
309 // exon minus strand
310 tricycle13y = (((-pos + (interval13y - 7) + 2) %3 +3) %3) + 7;
311 break;
312 case 10:
313 // intron minus strand
314 tricycle13y = 12;
315 break;
316 case 11:
317 tricycle13y = 11;
318 break;
319 case 12:
320 tricycle13y = 10;
321 break;
322 default:
323 Assert.a(false);
324 }
325 return tricycle13y;
326 }
327
328
329
330
331
332
333 // Given a hidden sequence that has 13 states and values [0, 12], converts that
334 // sequence to a 39 state model with values [0, 38].
335 public static void convertSeqFrom13To39(TrainingSequence<Character> seq)
336 {
337 setStateMap();
338 if (seq.length() < 2) return;
339
340 ArrayList<SeqPair> states = new ArrayList<SeqPair>();
341 int i, state39, total, k, seqIdx, curIdx;
342 int seqLen = seq.length();
343 int startElement = seq.getY(0);
344 int startIndex = 0;
345
346 int prevElement = seq.getY(0);
347 int curElement = seq.getY(1);
348 int nextElement;
349 int prevState = -1;
350
351 for (i=2; i<seqLen; i++)
352 {
353 curIdx = i-1;
354
355 nextElement = seq.getY(i);
356 Assert.a(curElement>=0 && curElement <=12, "invalid character in hidden sequence, '", curElement, "'");
357
358 if (!sameState(startElement, curElement))
359 {
360 // End the previous state
361 state39 = getState39(startElement, prevElement, curElement, prevState);
362 states.add(new SeqPair(state39, (curIdx-1)-startIndex + 1));
363 prevState = state39;
364
365 // Start the current state
366 startElement = curElement;
367 startIndex = curIdx;
368 }
369 prevElement = curElement;
370 curElement = nextElement;
371 }
372
373 // Add last state
374 state39 = getState39(startElement, prevElement, -1, prevState);
375 states.add(new SeqPair(state39, (i-1)-startIndex+1));
376
377 // Verify sequence lengths will be the same
378 total = 0;
379 for (i=0; i<states.size(); i++)
380 total += states.get(i).length;
381 Assert.a(total == seqLen, "Sum of state lengths = " + total + ", Sequence Length = " + seqLen);
382
383 // Set the values in the sequence to be in 39 state model.
384 seqIdx = 0;
385 for (i=0; i<states.size(); i++)
386 {
387 for (k=0; k<states.get(i).length; k++)
388 {
389 seq.setY(seqIdx, states.get(i).state);
390 seqIdx++;
391 }
392 }
393 }
394
395
396 // Given a hidden sequence that has 39 states and values [0, 38], converts that
397 // sequence to a 13 state model with values [0, 12].
398 public static void convertSeqFrom39To13(TrainingSequence<Character> seq)
399 {
400 int len = seq.length();
401 if (len < 1) {return;}
402
403 int[] y = new int[len];
404
405 for (int j=0; j<len; j++) {
406 y[j] = seq.getY(j); }
407
408 convertSeqFrom39To13(y);
409
410 for (int j=0; j<len; j++) {
411 seq.setY(j,y[j]); }
412
413 }
414
415 // Given a hidden sequence that has 39 states and values [0, 38], converts that
416 // sequence to a 13 state model with values [0, 12].
417 public static void convertSeqFrom39To13(int[] seq)
418 {
419 if (seq.length < 1) {return;}
420
421 int seqLen = seq.length;
422 int cur, i, exonPhase;
423 boolean inExon = false;
424 exonPhase = -1;
425
426 cur = seq[0];
427 for (i=1; i<seqLen; i++)
428 {
429 cur = seq[i];
430 Assert.a(cur>=0 && cur <=38, "invalid character in hidden sequence, '", cur, "'");
431
432 if (cur==0) // INTERGENIC, do nothing, 0 in both models
433 {
434 inExon = false;
435 }
436 else if (isIntron39(cur)) // INTRON
437 {
438 seq[i] = convertIntron39To13(cur);
439 inExon = false;
440 }
441 else // EXON
442 {
443 if (inExon) // already been here, just keep cycling through 1, 2, 3, or 9, 8, 7
444 {
445 seq[i] = exonPhase;
446 exonPhase = incrementExonPhase(exonPhase);
447 }
448 else // first time we're entering an exon
449 {
450 inExon = true;
451 exonPhase = convertExon39To13(cur);
452 seq[i] = exonPhase;
453 exonPhase = incrementExonPhase(exonPhase);
454 }
455 }
456 }
457 }
458
459
460 //
461 // SUPPORTING FUNCTIONS FOR 13 -> 39 CONVERSION
462 //
463
464 // Given a start and end state in the 13 state model, returns the state in the 39 state model.
465 private static int getState39(int start, int end, int next, int prevState)
466 {
467 int state39 = -1;
468
469 if (start == 0 && end == 0) state39 = map.get("NTG").intValue();
470 else if (start == 6 && end == 6) state39 = map.get("I0p").intValue();
471 else if (start == 4 && end == 4) state39 = map.get("I1p").intValue();
472 else if (start == 5 && end == 5) state39 = map.get("I2p").intValue();
473 else if (start == 12 && end == 12) state39 = map.get("I0m").intValue();
474 else if (start == 10 && end == 10) state39 = map.get("I1m").intValue();
475 else if (start == 11 && end == 11) state39 = map.get("I2m").intValue();
476
477 else if (start == 1 && end == 3)
478 {
479 if (prevState == map.get("NTG").intValue() && next == 0) state39 = map.get("ENNp").intValue();
480 else if (prevState == map.get("NTG").intValue()) state39 = map.get("EN0p").intValue();
481 else if (next == 0) state39 = map.get("E0Np").intValue();
482 else state39 = map.get("E00p").intValue();
483 }
484 else if (start == 1 && end == 1)
485 {
486 if (prevState == map.get("NTG").intValue()) state39 = map.get("EN1p").intValue();
487 else state39 = map.get("E01p").intValue();
488 }
489 else if (start == 1 && end == 2)
490 {
491 if (prevState == map.get("NTG").intValue()) state39 = map.get("EN2p").intValue();
492 else state39 = map.get("E02p").intValue();
493 }
494 else if (start == 2 && end == 3)
495 {
496 if (next == 0) state39 = map.get("E1Np").intValue();
497 else state39 = map.get("E10p").intValue();
498 }
499 else if (start == 3 && end == 3)
500 {
501 if (next == 0) state39 = map.get("E2Np").intValue();
502 else state39 = map.get("E20p").intValue();
503 }
504 else if (start == 2 && end == 1) state39 = map.get("E11p").intValue();
505 else if (start == 2 && end == 2) state39 = map.get("E12p").intValue();
506 else if (start == 3 && end == 1) state39 = map.get("E21p").intValue();
507 else if (start == 3 && end == 2) state39 = map.get("E22p").intValue();
508
509 else if (start == 9 && end == 7)
510 {
511 if (prevState == map.get("NTG").intValue() && next == 0) state39 = map.get("ENNm").intValue();
512 else if (prevState == map.get("NTG").intValue()) state39 = map.get("E0Nm").intValue();
513 else if (next == 0) state39 = map.get("EN0m").intValue();
514 else state39 = map.get("E00m").intValue();
515 }
516 else if (start == 7 && end == 7)
517 {
518 if (next == 0) state39 = map.get("EN1m").intValue();
519 else state39 = map.get("E01m").intValue();
520 }
521 else if (start == 8 && end == 7)
522 {
523 if (next == 0) state39 = map.get("EN2m").intValue();
524 else state39 = map.get("E02m").intValue();
525 }
526 else if (start == 9 && end == 8)
527 {
528 if (prevState == map.get("NTG").intValue()) state39 = map.get("E1Nm").intValue();
529 else state39 = map.get("E10m").intValue();
530 }
531 else if (start == 9 && end == 9)
532 {
533 if (prevState == map.get("NTG").intValue()) state39 = map.get("E2Nm").intValue();
534 else state39 = map.get("E20m").intValue();
535 }
536 else if (start == 7 && end == 8) state39 = map.get("E11m").intValue();
537 else if (start == 8 && end == 8) state39 = map.get("E12m").intValue();
538 else if (start == 7 && end == 9) state39 = map.get("E21m").intValue();
539 else if (start == 8 && end == 9) state39 = map.get("E22m").intValue();
540
541 if (state39 == -1) Assert.a(false, "start = " + start + " end = " + end);
542 return state39;
543 }
544
545 // Given a two states in the 13 state model, returns true if they are the same state,
546 // else returns false. I.e. 1,2,3 are all positive exons and considered the same state.
547 private static boolean sameState(int state1, int state2)
548 {
549 if (state1 == state2)
550 return true;
551 else if (isPlusExon(state1) && isPlusExon(state2))
552 return true;
553 else if (isMinusExon(state1) && isMinusExon(state2))
554 return true;
555 return false;
556 }
557
558 // Given a state in the 13 state model, returns true if it is an exon on the plus strand.
559 private static boolean isPlusExon(int state)
560 {
561 if (state == 1 || state == 2 || state == 3)
562 return true;
563 return false;
564 }
565
566 // Given a state in the 13 state model, returns true if it is an exon on the minus strand.
567 private static boolean isMinusExon(int state)
568 {
569 if (state == 9 || state == 8 || state == 7)
570 return true;
571 return false;
572 }
573
574 //
575 // SUPPORTING FUNCTIONS FOR 39 -> 13 CONVERSION
576 //
577
578 // Given an INTRON state in the 39 model state, returns the intron state in the 13 model state.
579 private static int convertIntron39To13(int element)
580 {
581 if (element == map.get("I1p").intValue()) return 4; // intron1
582 if (element == map.get("I2p").intValue()) return 5; // intron2
583 if (element == map.get("I0p").intValue()) return 6; // intron3
584 if (element == map.get("I1m").intValue()) return 10; // intron1m
585 if (element == map.get("I2m").intValue()) return 11; // intron2m
586 if (element == map.get("I0m").intValue()) return 12; // intron3m
587
588 Assert.a(false);
589 return -1;
590 }
591
592 // Given an EXON state in the 39 model state, returns the intron state in the 13 model state.
593 private static int convertExon39To13(int element)
594 {
595 int exonPhase = -1;
596
597 if (getStrand39(element) == +1) // plus strand
598 {
599 if (element == map.get("ENNp").intValue()) exonPhase = 1; // exon1
600 else if (element == map.get("EN0p").intValue()) exonPhase = 1; // exon1
601 else if (element == map.get("EN1p").intValue()) exonPhase = 1; // exon1
602 else if (element == map.get("EN2p").intValue()) exonPhase = 1; // exon1
603 else if (element == map.get("E00p").intValue()) exonPhase = 1; // exon1
604 else if (element == map.get("E01p").intValue()) exonPhase = 1; // exon1
605 else if (element == map.get("E02p").intValue()) exonPhase = 1; // exon1
606 else if (element == map.get("E10p").intValue()) exonPhase = 2; // exon2
607 else if (element == map.get("E11p").intValue()) exonPhase = 2; // exon2
608 else if (element == map.get("E12p").intValue()) exonPhase = 2; // exon2
609 else if (element == map.get("E20p").intValue()) exonPhase = 3; // exon3
610 else if (element == map.get("E21p").intValue()) exonPhase = 3; // exon3
611 else if (element == map.get("E22p").intValue()) exonPhase = 3; // exon3
612 else if (element == map.get("E0Np").intValue()) exonPhase = 1; // exon1
613 else if (element == map.get("E1Np").intValue()) exonPhase = 2; // exon2
614 else if (element == map.get("E2Np").intValue()) exonPhase = 3; // exon3
615 }
616 else if (getStrand39(element) == -1) // minus strand
617 {
618 if (element == map.get("ENNm").intValue()) exonPhase = 9; // exon3m
619 else if (element == map.get("EN0m").intValue()) exonPhase = 9; // exon3m
620 else if (element == map.get("EN1m").intValue()) exonPhase = 7; // exon1m
621 else if (element == map.get("EN2m").intValue()) exonPhase = 8; // exon2m
622 else if (element == map.get("E00m").intValue()) exonPhase = 9; // exon3m
623 else if (element == map.get("E01m").intValue()) exonPhase = 7; // exon1m
624 else if (element == map.get("E02m").intValue()) exonPhase = 8; // exon2m
625 else if (element == map.get("E10m").intValue()) exonPhase = 9; // exon3m
626 else if (element == map.get("E11m").intValue()) exonPhase = 7; // exon1m
627 else if (element == map.get("E12m").intValue()) exonPhase = 8; // exon2m
628 else if (element == map.get("E20m").intValue()) exonPhase = 9; // exon3m
629 else if (element == map.get("E21m").intValue()) exonPhase = 7; // exon1m
630 else if (element == map.get("E22m").intValue()) exonPhase = 8; // exon2m
631 else if (element == map.get("E0Nm").intValue()) exonPhase = 9; // exon3m
632 else if (element == map.get("E1Nm").intValue()) exonPhase = 9; // exon3m
633 else if (element == map.get("E2Nm").intValue()) exonPhase = 9; // exon3m
634 }
635 if (exonPhase == -1) Assert.a(false);
636 return exonPhase;
637 }
638
639 // Given an exon in the 39 state model, returns the strand (+1 or -1)
640 private static int getStrand39(int element)
641 {
642 if (7 <= element && element <= 22)
643 return (+1);
644 else if (23 <= element && element <= 38)
645 return (-1);
646
647 Assert.a(false);
648 return 0;
649 }
650
651 // Given an element in the 39 model state, returns true if this element
652 // is an intron, else returns false
653 private static boolean isIntron39(int element)
654 {
655 if (1 <= element && element <= 6)
656 return true;
657 else
658 return false;
659 }
660
661 private static int incrementExonPhase(int phase)
662 {
663 if (phase == 1 || phase == 2 || phase == 3)
664 {
665 phase++;
666 if (phase == 4) phase = 1;
667 }
668 else if (phase == 9 || phase == 8 || phase == 7)
669 {
670 phase--;
671 if (phase == 6) phase = 9;
672 }
673 else
674 {
675 Assert.a(false);
676 }
677 return phase;
678 }
679
680 //
681 // CONVERT A HIDDEN SEQUENCE TO A GFF FILE
682 //
683
684 // This function converts a 13 state model hidden sequence to a GFF file.
685 // Used for debugging only.
686 public static void writeHiddenSequenceGFF(TrainingSequence<Character> refStates, String filename) throws IOException
687 {
688 int i, ref, exonStart, exonEnd, geneNum;
689 boolean inPlusGene = false;
690 boolean inMinusGene = false;
691 geneNum = 0;
692 exonStart = exonEnd = geneNum = 0;
693 Writer fout = new BufferedWriter(new FileWriter(filename));
694
695 for (i=0; i<refStates.length(); i++)
696 {
697 ref = refStates.getY(i);
698
699 if (ref == 1 || ref == 2 || ref == 3)
700 {
701 if (!inPlusGene)
702 {
703 exonStart = i+1;
704 inPlusGene = true;
705 }
706 }
707 else if (ref == 7 || ref == 8 || ref == 9)
708 {
709 if (!inMinusGene)
710 {
711 exonStart = i+1;
712 inMinusGene = true;
713 }
714 }
715 else if (inPlusGene && (ref == 4 || ref == 5 || ref == 6) )
716 {
717 exonEnd = i;
718 fout.write("XXX\tghmm\tENNp\t" + exonStart + "\t" + exonEnd + "\t0\t+\t0\tgene_" + geneNum + "\n");
719 inPlusGene = false;
720 }
721 else if (inMinusGene && (ref == 10 || ref == 11 || ref == 12) )
722 {
723 exonEnd = i;
724 fout.write("XXX\tghmm\tENNm\t" + exonStart + "\t" + exonEnd + "\t0\t-\t0\tgene_" + geneNum + "\n");
725 inMinusGene = false;
726 }
727 else
728 {
729 if (inPlusGene || inMinusGene) // was in gene at last nucleotide, but not in any more
730 {
731 exonEnd = i;
732
733 if (inPlusGene)
734 fout.write("XXX\tghmm\tENNp\t" + exonStart + "\t" + exonEnd + "\t0\t+\t0\tgene_" + geneNum + "\n");
735 else
736 fout.write("XXX\tghmm\tENNm\t" + exonStart + "\t" + exonEnd + "\t0\t-\t0\tgene_" + geneNum + "\n");
737 inPlusGene = false;
738 inMinusGene = false;
739 geneNum++;
740 }
741 }
742
743 }
744 fout.close();
745 }
746
747 // This function converts a 39 state model hidden sequence to a GFF file.
748 // Used for debugging only.
749 public void writeHiddenSequence39GFF(TrainingSequence<Character> refStates, String filename) throws IOException
750 {
751 int i, ref, exonStart, exonEnd, geneNum;
752 boolean inPlusGene = false;
753 boolean inMinusGene = false;
754 geneNum = 0;
755 exonStart = exonEnd = geneNum = 0;
756 Writer fout = new BufferedWriter(new FileWriter(filename));
757 for (i=0; i<refStates.length(); i++)
758 {
759 ref = refStates.getY(i);
760
761 if (isPlusExon39(ref)) // plus exon
762 {
763 if (!inPlusGene)
764 {
765 exonStart = i+1;
766 inPlusGene = true;
767 }
768 }
769 else if (isMinusExon39(ref)) // minus exon
770 {
771 if (!inMinusGene)
772 {
773 exonStart = i+1;
774 inMinusGene = true;
775 }
776 }
777 else if (inPlusGene && isPlusIntron39(ref) ) // plus intron
778 {
779 exonEnd = i;
780 fout.write("XXX\tghmm\t" + stateIdxToString(refStates.getY(exonStart)) + "\t" + exonStart + "\t" + exonEnd + "\t0\t+\t0\tgene_" + geneNum + "\n");
781 inPlusGene = false;
782 }
783 else if (inMinusGene && isMinusIntron39(ref) ) // minus intron
784 {
785 exonEnd = i;
786 fout.write("XXX\tghmm\t" + stateIdxToString(refStates.getY(exonStart)) + "\t" + exonStart + "\t" + exonEnd + "\t0\t-\t0\tgene_" + geneNum + "\n");
787 inMinusGene = false;
788 }
789 else
790 {
791 if (inPlusGene || inMinusGene) // was in gene at last nucleotide, but not in any more
792 {
793 exonEnd = i;
794
795 if (inPlusGene)
796 fout.write("XXX\tghmm\t" + stateIdxToString(refStates.getY(exonStart)) + "\t" + exonStart + "\t" + exonEnd + "\t0\t+\t0\tgene_" + geneNum + "\n");
797 else
798 fout.write("XXX\tghmm\t" + stateIdxToString(refStates.getY(exonStart)) + "\t" + exonStart + "\t" + exonEnd + "\t0\t-\t0\tgene_" + geneNum + "\n");
799 inPlusGene = false;
800 inMinusGene = false;
801 geneNum++;
802 }
803 }
804
805 }
806 fout.close();
807 }
808
809 // Returns true if state is a exon in the 39 state model, else returns false
810 private boolean isPlusExon39(int state)
811 {
812 if (7 <= state && state <= 22)
813 return true;
814 return false;
815 }
816 private boolean isMinusExon39(int state)
817 {
818 if (23 <= state && state <= 38)
819 return true;
820 return false;
821 }
822 private boolean isPlusIntron39(int state)
823 {
824 if (1 <= state && state <= 3)
825 return true;
826 return false;
827 }
828 private boolean isMinusIntron39(int state)
829 {
830 if (4 <= state && state <= 6)
831 return true;
832 return false;
833 }
834
835 //
836 // COMMON FUNCTIONS AND STRUCTS
837 //
838
839 // NOTE: This info is copied from GHMM. It likely should not be in two places.
840 private static void setStateMap()
841 {
842 int statenum = 0;
843
844 map.put("NTG", new Integer(statenum)); statenum++;
845 map.put("I0p", new Integer(statenum)); statenum++;
846 map.put("I1p", new Integer(statenum)); statenum++;
847 map.put("I2p", new Integer(statenum)); statenum++;
848 map.put("I0m", new Integer(statenum)); statenum++;
849 map.put("I1m", new Integer(statenum)); statenum++;
850 map.put("I2m", new Integer(statenum)); statenum++;
851 map.put("ENNp", new Integer(statenum)); statenum++;
852 map.put("EN0p", new Integer(statenum)); statenum++;
853 map.put("EN1p", new Integer(statenum)); statenum++;
854 map.put("EN2p", new Integer(statenum)); statenum++;
855 map.put("E00p", new Integer(statenum)); statenum++;
856 map.put("E01p", new Integer(statenum)); statenum++;
857 map.put("E02p", new Integer(statenum)); statenum++;
858 map.put("E10p", new Integer(statenum)); statenum++;
859 map.put("E11p", new Integer(statenum)); statenum++;
860 map.put("E12p", new Integer(statenum)); statenum++;
861 map.put("E20p", new Integer(statenum)); statenum++;
862 map.put("E21p", new Integer(statenum)); statenum++;
863 map.put("E22p", new Integer(statenum)); statenum++;
864 map.put("E0Np", new Integer(statenum)); statenum++;
865 map.put("E1Np", new Integer(statenum)); statenum++;
866 map.put("E2Np", new Integer(statenum)); statenum++;
867 map.put("ENNm", new Integer(statenum)); statenum++;
868 map.put("EN0m", new Integer(statenum)); statenum++;
869 map.put("EN1m", new Integer(statenum)); statenum++;
870 map.put("EN2m", new Integer(statenum)); statenum++;
871 map.put("E00m", new Integer(statenum)); statenum++;
872 map.put("E01m", new Integer(statenum)); statenum++;
873 map.put("E02m", new Integer(statenum)); statenum++;
874 map.put("E10m", new Integer(statenum)); statenum++;
875 map.put("E11m", new Integer(statenum)); statenum++;
876 map.put("E12m", new Integer(statenum)); statenum++;
877 map.put("E20m", new Integer(statenum)); statenum++;
878 map.put("E21m", new Integer(statenum)); statenum++;
879 map.put("E22m", new Integer(statenum)); statenum++;
880 map.put("E0Nm", new Integer(statenum)); statenum++;
881 map.put("E1Nm", new Integer(statenum)); statenum++;
882 map.put("E2Nm", new Integer(statenum)); statenum++;
883 }
884
885 private String stateIdxToString(int state)
886 {
887 switch(state)
888 {
889 case 0: return "NTG";
890 case 1: return "I0p";
891 case 2: return "I1p";
892 case 3: return "I2p";
893 case 4: return "I0m";
894 case 5: return "I1m";
895 case 6: return "I2m";
896 case 7: return "ENNp";
897 case 8: return "EN0p";
898 case 9: return "EN1p";
899 case 10: return "EN2p";
900 case 11: return "E00p";
901 case 12: return "E01p";
902 case 13: return "E02p";
903 case 14: return "E10p";
904 case 15: return "E11p";
905 case 16: return "E12p";
906 case 17: return "E20p";
907 case 18: return "E21p";
908 case 19: return "E22p";
909 case 20: return "E0Np";
910 case 21: return "E1Np";
911 case 22: return "E2Np";
912 case 23: return "ENNm";
913 case 24: return "EN0m";
914 case 25: return "EN1m";
915 case 26: return "EN2m";
916 case 27: return "E00m";
917 case 28: return "E01m";
918 case 29: return "E02m";
919 case 30: return "E10m";
920 case 31: return "E11m";
921 case 32: return "E12m";
922 case 33: return "E20m";
923 case 34: return "E21m";
924 case 35: return "E22m";
925 case 36: return "E0Nm";
926 case 37: return "E1Nm";
927 case 38: return "E2Nm";
928 }
929 return "XXX";
930 }
931
932 private static class SeqPair
933 {
934 public int state;
935 public int length;
936 public SeqPair(int st, int len) {state=st; length=len; }
937 }
938
939
940 }