001 package calhoun.analysis.crf.features.interval29;
002
003 import java.util.List;
004
005 import calhoun.analysis.crf.AbstractFeatureManager;
006 import calhoun.analysis.crf.FeatureList;
007 import calhoun.analysis.crf.ModelManager;
008 import calhoun.analysis.crf.features.supporting.LogProbLookup;
009 import calhoun.analysis.crf.io.InputSequence;
010 import calhoun.analysis.crf.io.TrainingSequence;
011 import calhoun.util.Assert;
012
013
014 public abstract class ReferenceBasePredictorInterval29Base extends AbstractFeatureManager<Character> {
015
016 private static final long serialVersionUID = 8194502006226691957L;
017 ModelManager model;
018 int startIx;
019
020 boolean multipleFeatures = false;
021
022 double pseudoCounts;
023 int lookback;
024
025 LogProbLookup intron;
026 LogProbLookup intergenic;
027 LogProbLookup[] exonic;
028
029 public ReferenceBasePredictorInterval29Base() {
030 }
031
032 public String getFeatureName(int featureIndex) {
033 if(multipleFeatures) {
034 int feat = featureIndex - startIx;
035 String table = "";
036 switch(feat) {
037 case 0:
038 table = "intergenic";
039 break;
040 case 1:
041 table = "exon";
042 break;
043 case 2:
044 table = "intron";
045 break;
046 case 3:
047 table = "minus exon";
048 break;
049 case 4:
050 table = "minus intron";
051 break;
052 default:
053 Assert.a(false);
054 }
055 return "referenceBasePredictorInterval29 "+table;
056 }
057 else {
058 return "referenceBasePredictorInterval29";
059 }
060 }
061
062 public int getNumFeatures() {
063 return multipleFeatures ? 5 : 1;
064 }
065
066 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
067 startIx = startingIndex;
068
069 model = modelInfo;
070 Interval29Tools.verify(modelInfo);
071
072 pseudoCounts = 1.0;
073 lookback = 3;
074
075
076 // Construct the space for the lookup tables.
077 exonic = new LogProbLookup[3];
078 for (int j=0; j<3; j++) {
079 exonic[j] = new LogProbLookup(lookback,pseudoCounts);
080 }
081 intron = new LogProbLookup(lookback,pseudoCounts);
082 intergenic = new LogProbLookup(lookback,pseudoCounts);
083
084
085 for(TrainingSequence<? extends Character> seq : data) {
086 for (int pos=0; pos<seq.length(); pos++) {
087
088 int state = seq.getY(pos);
089 switch(state) {
090 case(0):
091 case(13):
092 case(14):
093 case(21):
094 case(22):
095 intergenic.increment(seq,pos,true);
096 intergenic.increment(seq,pos,false);
097 break;
098 case(1):
099 case(2):
100 case(3):
101 exonic[((pos-state+1)%3+3)%3].increment(seq,pos,true);
102 break;
103 case(4):
104 case(5):
105 case(6):
106 case(15):
107 case(16):
108 case(17):
109 case(18):
110 case(19):
111 case(20):
112 intron.increment(seq,pos,true);
113 break;
114 case(7):
115 case(8):
116 case(9):
117 exonic[((-pos+state+1)%3+3)%3].increment(seq,pos,false);
118 break;
119 case(10):
120 case(11):
121 case(12):
122 case(23):
123 case(24):
124 case(25):
125 case(26):
126 case(27):
127 case(28):
128 intron.increment(seq,pos,false);
129 break;
130 default:
131 Assert.a(false);
132 }
133 }
134 }
135
136 for (int j=0; j<3; j++) {
137 exonic[j].finalize();
138 }
139 intron.finalize();
140 intergenic.finalize();
141 }
142
143
144 public void evaluateNode(InputSequence<? extends Character> seq, int pos, int state, FeatureList result) {
145 double evaluation=0;
146
147 int indexOffset = Integer.MIN_VALUE;
148 int phase;
149 switch(state) {
150 case(0):
151 case(13):
152 case(14):
153 case(21):
154 case(22):
155 evaluation = intergenic.lookup(seq,pos,true);
156 indexOffset = 0;
157 break;
158 case(1):
159 case(2):
160 case(3):
161 phase = ((pos-state+1)%3+3)%3;
162 evaluation = exonic[phase].lookup(seq,pos,true);
163 indexOffset = 1;// + phase;
164 break;
165 case(4):
166 case(5):
167 case(6):
168 case(15):
169 case(16):
170 case(17):
171 case(18):
172 case(19):
173 case(20):
174 evaluation = intron.lookup(seq,pos,true);
175 indexOffset = 2;
176 break;
177 case(7):
178 case(8):
179 case(9):
180 phase = ((-pos+state+1)%3+3)%3;
181 evaluation = exonic[phase].lookup(seq,pos,false);
182 indexOffset = 3;// + phase;
183 break;
184 case(10):
185 case(11):
186 case(12):
187 case(23):
188 case(24):
189 case(25):
190 case(26):
191 case(27):
192 case(28):
193 evaluation = intron.lookup(seq,pos,false);
194 indexOffset = 4;
195 break;
196 default:
197 Assert.a(false);
198 }
199
200 result.addFeature(startIx + (multipleFeatures ? indexOffset : 0), evaluation);
201 }
202
203 /** if true, a separate feature index is used for each state, creating 21 weights instead of 1.
204 * @return returns true if a separate feature index is used for each state
205 */
206 public boolean isMultipleFeatures() {
207 return multipleFeatures;
208 }
209
210 /**
211 * @param multipleFeatures The multipleFeatures to set.
212 */
213 public void setMultipleFeatures(boolean weightPerState) {
214 this.multipleFeatures = weightPerState;
215 }
216
217
218 }