001 package calhoun.analysis.crf.features.interval29;
002
003 import java.util.List;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.AbstractFeatureManager;
009 import calhoun.analysis.crf.CacheStrategySpec;
010 import calhoun.analysis.crf.FeatureList;
011 import calhoun.analysis.crf.FeatureManagerEdge;
012 import calhoun.analysis.crf.ModelManager;
013 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014 import calhoun.analysis.crf.features.supporting.PWMLookup;
015 import calhoun.analysis.crf.io.InputSequence;
016 import calhoun.analysis.crf.io.TrainingSequence;
017 import calhoun.util.Assert;
018
019 public class PWMInterval29 extends AbstractFeatureManager<Character> implements FeatureManagerEdge<Character> {
020 private static final long serialVersionUID = -7659288739348604129L;
021 private static final Log log = LogFactory.getLog(PWMInterval29.class);
022 boolean debug = log.isDebugEnabled();
023
024 boolean multipleFeatures = false;
025
026 int startIx; // The index of the first feature managed by this FeatureManager
027 ModelManager model;
028
029
030 PWMLookup start,stop;
031 PWMLookup[] donor,acceptor;
032 double pseudoCounts;
033
034
035 public PWMInterval29() {
036 }
037
038 public int getNumFeatures() {
039 return multipleFeatures ? 8 : 1;
040 }
041
042 public String getFeatureName(int featureIndex) {
043 if(multipleFeatures) {
044 String feature = "";
045 switch(featureIndex - startIx) {
046 case 0:
047 feature = "start";
048 break;
049 case 1:
050 feature = "stop";
051 break;
052 case 2:
053 case 3:
054 case 4:
055 feature = "donor"+(featureIndex - startIx - 2);
056 case 5:
057 case 6:
058 case 7:
059 feature = "acceptor"+(featureIndex - startIx - 5);
060 }
061 return "PwmFeatureInterval29 - "+feature;
062 }
063 else {
064 Assert.a(featureIndex == startIx);
065 return "PwmFeatureInterval29";
066 }
067 }
068
069 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
070 startIx = startingIndex;
071 model = modelInfo;
072
073 Interval29Tools.verify(modelInfo);
074
075
076 // Construct the space for the lookup tables.
077 pseudoCounts = 1.0;
078 donor = new PWMLookup[3];
079 acceptor = new PWMLookup[3];
080 for (int j=0; j<3; j++) {
081 donor[j] = new PWMLookup(Interval29Model.getPadExon3prime(),Interval29Model.getPadIntron5prime(),pseudoCounts); // donor signal xxx|GTxxxx
082 acceptor[j] = new PWMLookup(Interval29Model.getPadIntron3prime(),Interval29Model.getPadExon5prime(),pseudoCounts); // acceptor signal xxxxxxxAG|xxxxxx
083 }
084 // Note: start PWM and stop PWM must extend equally far into the intergenic space, so that can set pads
085 // stop and donor must also extend same amount into exon
086 // start and acceptor must extend same amount into exon
087 start = new PWMLookup(Interval29Model.getPadIntergenic(),Interval29Model.getPadExon5prime(),pseudoCounts); // start signal xxxxxxxxx|ATGxxx
088 stop = new PWMLookup(Interval29Model.getPadExon3prime(),Interval29Model.getPadIntergenic(),pseudoCounts); // stop signal xxx|TAGxxxxxx
089
090
091 // Increment the lookup tables below
092 for(TrainingSequence<? extends Character> seq : data) {
093 for (int pos=1; pos<seq.length(); pos++) { // note start at one not zero, so can look back at prevState
094
095 int state = seq.getY(pos);
096 int prevState = seq.getY(pos-1);
097 int iind;
098 switch(Interval29Tools.edgeConstraints[prevState*Interval29Tools.numStates + state]) {
099 case NONE:
100 case PCODE:
101 case MCODE:
102 break;
103 case NEVER:
104 Assert.a(false,"pos = "+pos+" prevState = " + modelInfo.getStateName(prevState) + " State = " + modelInfo.getStateName(state)); // A nice side effect of making sure the input sequence is legal, can omit this if you want to.
105 break;
106 case PSTART:
107 start.increment(seq, pos,true);
108 break;
109 case PDON:
110 // y(k) : k = i-j (mod 3)
111 iind = Interval29Tools.check012(state-15);
112 donor[iind].increment(seq,pos,true);
113 break;
114 case PACC:
115 iind = Interval29Tools.check012(prevState-4);
116 acceptor[iind].increment(seq,pos,true);
117 break;
118 case PSTOP:
119 stop.increment(seq,pos,true);
120 break;
121 case MSTART:
122 start.increment(seq,pos,false);
123 break;
124 case MDON:
125 iind = Interval29Tools.check012(prevState-10);
126 donor[iind].increment(seq,pos,false);
127 break;
128 case MACC:
129 // y(k) : k = i+j (mod 3)
130 iind = Interval29Tools.check012(state-23);
131 acceptor[iind].increment(seq,pos,false);
132 break;
133 case MSTOP:
134 stop.increment(seq,pos,false);
135 break;
136 case PKEEPE:
137 case PKEEPI:
138 case MKEEPE:
139 case MKEEPI:
140 case PSTOPPED:
141 case MSTARTED:
142 case PWILLSTART:
143 case MWILLSTOP:
144 break;
145 default:
146 Assert.a(false); // We should have a complete enumeration of possibilities above.
147 }
148 }
149 }
150
151 for (int j=0; j<3; j++) {
152 donor[j].completeCounts();
153 acceptor[j].completeCounts();
154 }
155 start.completeCounts();
156 stop.completeCounts();
157 }
158
159
160 public void evaluateEdge(InputSequence<? extends Character> seq, int pos, int previousState, int state, FeatureList result) {
161 double val = 0.0;
162
163 int featureIndex = Integer.MIN_VALUE;
164 int iind;
165 switch(Interval29Tools.edgeConstraints[previousState*Interval29Tools.numStates + state]) {
166 case NONE:
167 case PCODE:
168 case MCODE:
169 break;
170 case NEVER:
171 Assert.a(false); // A nice side effect of making sure the input sequence is legal, can omit this if you want to.
172 break;
173 case PSTART:
174 featureIndex = 0;
175 val = start.lookup(seq, pos,true);
176 break;
177 case PDON:
178 iind = Interval29Tools.check012(state-15);
179 featureIndex = 2+iind;;
180 val = donor[iind].lookup(seq,pos,true);
181 break;
182 case PACC:
183 iind = Interval29Tools.check012(previousState-4);
184 featureIndex = 5+iind;;
185 val = acceptor[iind].lookup(seq,pos,true);
186 break;
187 case PSTOP:
188 featureIndex = 1;
189 val = stop.lookup(seq,pos,true);
190 break;
191 case MSTART:
192 featureIndex = 0;
193 val = start.lookup(seq,pos,false);
194 break;
195 case MDON:
196 iind = Interval29Tools.check012(previousState-10);
197 featureIndex = 2+iind;;
198 val = donor[iind].lookup(seq,pos,false);
199 break;
200 case MACC:
201 iind = Interval29Tools.check012(state-23);
202 featureIndex = 5+iind;;
203 val = acceptor[iind].lookup(seq,pos,false);
204 break;
205 case MSTOP:
206 featureIndex = 1;
207 val = stop.lookup(seq,pos,false);
208 break;
209 case PKEEPE:
210 case PKEEPI:
211 case MKEEPE:
212 case MKEEPI:
213 case PSTOPPED:
214 case MSTARTED:
215 case PWILLSTART:
216 case MWILLSTOP:
217 break;
218 default:
219 Assert.a(false); // We should have a complete enumeration of possibilities above.
220
221 }
222 Assert.a(val<=0);
223 result.addFeature(startIx + (multipleFeatures ? featureIndex : 0),val);
224 }
225
226 public CacheStrategySpec getCacheStrategy() {
227 return new CacheStrategySpec(CacheStrategy.UNSPECIFIED);
228 }
229
230 /**
231 * @return Returns the multipleFeatures.
232 */
233 public boolean isMultipleFeatures() {
234 return multipleFeatures;
235 }
236
237 /**
238 * @param multipleFeatures The multipleFeatures to set.
239 */
240 public void setMultipleFeatures(boolean multipleFeatures) {
241 this.multipleFeatures = multipleFeatures;
242 }
243 }
244