001 package calhoun.analysis.crf.features.interval29;
002
003 import java.util.ArrayList;
004 import java.util.List;
005
006 import org.apache.commons.logging.Log;
007 import org.apache.commons.logging.LogFactory;
008
009 import calhoun.analysis.crf.ModelManager;
010 import calhoun.analysis.crf.io.TrainingSequence;
011 import calhoun.util.Assert;
012
013 public class Interval29Tools {
014 private static final Log log = LogFactory.getLog(Interval29Tools.class);
015
016 static protected enum Constraint {NONE, NEVER, PSTART, PDON, PACC, PSTOP, MSTART, MDON, MACC, MSTOP, PCODE, MCODE, PKEEPE, PKEEPI, MKEEPE, MKEEPI, PSTOPPED, MSTARTED, PWILLSTART, MWILLSTOP};
017 static protected Constraint[] edgeConstraints;
018 static protected Constraint[] nodeConstraints;
019 static int numStates;
020
021 static {
022 log.debug("Setting up constraints in Interval29Tools");
023
024 numStates = 29;
025
026 // Setup the node constraints
027 nodeConstraints = new Constraint[numStates];
028 for (int j=0; j<numStates; j++) {
029 nodeConstraints[j] = Constraint.NONE;
030 }
031 nodeConstraints[1] = Constraint.PCODE;
032 nodeConstraints[2] = Constraint.PCODE;
033 nodeConstraints[3] = Constraint.PCODE;
034 nodeConstraints[7] = Constraint.MCODE;
035 nodeConstraints[8] = Constraint.MCODE;
036 nodeConstraints[9] = Constraint.MCODE;
037 log.debug("The node constraints are as follows:");
038 for (int i=0; i<numStates; i++) {
039 log.debug(" " + i + " -- " + nodeConstraints[i]);
040 }
041
042 // setup the edge constraints
043 // edgeConstraint[i] is the constraint on the edge
044 // from state floor(i/numStates)
045 // to state i%numStates
046 edgeConstraints = new Constraint[numStates*numStates];
047
048 // The transition is impossible except when explicitly allowed below
049 for(int i=0; i<numStates; ++i) {
050 for(int j=0; j<numStates; ++j) {
051 edgeConstraints[i*numStates + j] = Constraint.NEVER;
052 }
053 }
054
055 // By default, self-transitions are allowed
056 for(int i=0; i<numStates; i++) {
057 edgeConstraints[i*numStates + i] = Constraint.NONE;
058 }
059 // e-ig -> integenic
060 edgeConstraints[(14)*numStates + (0)] = Constraint.PSTOPPED;
061 // em-ig -> intergenic
062 edgeConstraints[(22)*numStates + (0)] = Constraint.MSTARTED;
063 for(int i=0; i<3; i++) {
064 // intergenic -> ig-e
065 edgeConstraints[(0)*numStates + (i+13)] = Constraint.PWILLSTART; //Constraint.PSTART;
066 // intergenic -> ig-em
067 edgeConstraints[(0)*numStates + (i+21)] = Constraint.MWILLSTOP; //Constraint.MSTOP;
068
069 // Put constraints on EXON SIDE of intergenic-exon boundaries
070 // ig-e_-> e_i
071 edgeConstraints[(13)*numStates + (i+1)] = Constraint.PSTART;
072 // ig-em -> e_im
073 edgeConstraints[(21)*numStates + (i+7)] = Constraint.MSTOP;
074 // e_i -> e-ig
075 edgeConstraints[(i+1)*numStates + (14)] = Constraint.PSTOP;
076 // e_im -> em-ig
077 edgeConstraints[(i+7)*numStates + (22)] = Constraint.MSTART;
078
079 // exon-exon
080 // e_i -> e_i
081 edgeConstraints[(i+1)*numStates + (i+1)] = Constraint.PCODE;
082 // e_im -> e_im
083 edgeConstraints[(i+7)*numStates + (i+7)] = Constraint.MCODE;
084
085 // Put constraints on BOTH SIDES of intron-exon boundaries
086 for(int j=0; j<3; j++) {
087 // e_i -> e-i_j
088 edgeConstraints[(i+1)*numStates + (j+15)] = Constraint.PDON;
089 // i_i -> i-e_j
090 edgeConstraints[(i+4)*numStates + (j+18)] = Constraint.PACC;
091 // e_im -> em-i_jm
092 edgeConstraints[(i+7)*numStates + (j+23)] = Constraint.MACC;
093 // i_im -> im-e_jm
094 edgeConstraints[(i+10)*numStates + (j+26)] = Constraint.MDON;
095 }
096 // e-i_i -> i_i (intron_i, abbr.)
097 edgeConstraints[(i+15)*numStates + (i+4)] = Constraint.PKEEPI;
098 // i-e_i -> e_i
099 edgeConstraints[(i+18)*numStates + (i+1)] = Constraint.PKEEPE;
100 // em-i_im -> i_im
101 edgeConstraints[(i+23)*numStates + (i+10)] = Constraint.MKEEPI;
102 // im-e_im -> e_im
103 edgeConstraints[(i+26)*numStates + (i+7)] = Constraint.MKEEPE;
104 }
105
106 // log.warn("The transition constraints are as follows:");
107 // for (int i=0; i<numStates; i++) {
108 // String s = "";
109 // for (int j=0; j<numStates; j++) {
110 // s += edgeConstraints[i*numStates + j] + "\t";
111 // }
112 // System.out.println(s);
113 // System.out.println("");
114 // }
115 }
116
117 static protected int check012(int x) {
118 Assert.a(x>=0, "x is " + x);
119 Assert.a(x<=2, "x is " + x);
120 return x;
121 }
122
123 static protected void verify(ModelManager modelInfo) {
124 Assert.a(modelInfo.getNumStates()==29);
125
126 Assert.a(modelInfo.getStateName(0).equals("intergenic"));
127 Assert.a(modelInfo.getStateName(1).equals("exon0"));
128 Assert.a(modelInfo.getStateName(2).equals("exon1"));
129 Assert.a(modelInfo.getStateName(3).equals("exon2"));
130 Assert.a(modelInfo.getStateName(4).equals("intron0"));
131 Assert.a(modelInfo.getStateName(5).equals("intron1"));
132 Assert.a(modelInfo.getStateName(6).equals("intron2"));
133 Assert.a(modelInfo.getStateName(7).equals("exon0m"));
134 Assert.a(modelInfo.getStateName(8).equals("exon1m"));
135 Assert.a(modelInfo.getStateName(9).equals("exon2m"));
136 Assert.a(modelInfo.getStateName(10).equals("intron0m"));
137 Assert.a(modelInfo.getStateName(11).equals("intron1m"));
138 Assert.a(modelInfo.getStateName(12).equals("intron2m"));
139 // XXX: add Asserts for rest of states
140 }
141
142 static protected List<TrainingSequence<?>> checkValidTransitions(List<? extends TrainingSequence<?>> data) {
143 List<TrainingSequence<?>> goodData = new ArrayList<TrainingSequence<?>>();
144 for(TrainingSequence<?> seq : data) {
145 boolean validSequence = true;
146 for (int pos=1; pos<seq.length(); pos++) { // note start at one not zero, so can look back at prevState
147 int state = seq.getY(pos);
148 int prevState = seq.getY(pos-1);
149 if (Interval29Tools.edgeConstraints[prevState*Interval29Tools.numStates + state] == Interval29Tools.Constraint.NEVER) {
150 System.out.println("bad: " + prevState + " " + state);
151 validSequence = false;
152 //Assert.a(false,"pos = "+pos+" prevState = " + modelInfo.getStateName(prevState) + " State = " + modelInfo.getStateName(state)); // A nice side effect of making sure the input sequence is legal, can omit this if you want to.
153 break;
154 }
155 }
156 if (validSequence) {
157 goodData.add(seq);
158 }
159 }
160 return goodData;
161 }
162 }