001    package calhoun.analysis.crf.features.interval29;
002    
003    import java.util.List;
004    
005    import calhoun.analysis.crf.AbstractFeatureManager;
006    import calhoun.analysis.crf.FeatureList;
007    import calhoun.analysis.crf.ModelManager;
008    import calhoun.analysis.crf.features.supporting.LogProbLookup;
009    import calhoun.analysis.crf.io.InputSequence;
010    import calhoun.analysis.crf.io.TrainingSequence;
011    import calhoun.util.Assert;
012    
013    
014    public abstract class ReferenceBasePredictorInterval29Base extends AbstractFeatureManager<Character> {
015    
016            private static final long serialVersionUID = 8194502006226691957L;
017            ModelManager model;
018            int startIx;
019                    
020            boolean multipleFeatures = false;
021            
022            double pseudoCounts;
023            int lookback;
024            
025            LogProbLookup   intron;
026            LogProbLookup   intergenic;
027            LogProbLookup[] exonic;
028            
029            public ReferenceBasePredictorInterval29Base() {
030            }
031    
032            public String getFeatureName(int featureIndex) {
033                    if(multipleFeatures) {
034                            int feat = featureIndex - startIx;
035                            String table = "";
036                            switch(feat) {
037                                    case 0:
038                                            table = "intergenic";
039                                            break;
040                                    case 1:
041                                            table = "exon";
042                                            break;
043                                    case 2:
044                                            table = "intron";
045                                            break;
046                                    case 3:
047                                            table = "minus exon";
048                                            break;
049                                    case 4:
050                                            table = "minus intron";
051                                            break;
052                                    default:
053                                            Assert.a(false);
054                                    }       
055                            return "referenceBasePredictorInterval29 "+table;
056                    }
057                    else {
058                            return "referenceBasePredictorInterval29";
059                    }
060            }
061    
062            public int getNumFeatures() {
063                    return multipleFeatures ? 5 : 1;
064            }
065            
066            public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
067                    startIx = startingIndex;
068                    
069                    model = modelInfo;
070                    Interval29Tools.verify(modelInfo);
071    
072                    pseudoCounts = 1.0;     
073                    lookback = 3;
074    
075                    
076                    // Construct the space for the lookup tables.
077                    exonic = new LogProbLookup[3];
078                    for (int j=0; j<3; j++) {
079                            exonic[j] = new LogProbLookup(lookback,pseudoCounts);
080                    }
081                    intron     = new LogProbLookup(lookback,pseudoCounts);
082                    intergenic = new LogProbLookup(lookback,pseudoCounts);
083    
084    
085                    for(TrainingSequence<? extends Character> seq : data) {
086                            for (int pos=0; pos<seq.length(); pos++) {
087                                    
088                                    int state = seq.getY(pos);
089                                    switch(state) {
090                                    case(0):
091                                    case(13):
092                                    case(14):       
093                                    case(21):
094                                    case(22):                                       
095                                            intergenic.increment(seq,pos,true);
096                                            intergenic.increment(seq,pos,false);
097                                            break;
098                                    case(1):
099                                    case(2):
100                                    case(3):
101                                            exonic[((pos-state+1)%3+3)%3].increment(seq,pos,true);
102                                            break;
103                                    case(4):
104                                    case(5):
105                                    case(6):
106                                    case(15):
107                                    case(16):
108                                    case(17):
109                                    case(18):
110                                    case(19):
111                                    case(20):                                       
112                                            intron.increment(seq,pos,true);
113                                            break;
114                                    case(7):
115                                    case(8):
116                                    case(9):
117                                            exonic[((-pos+state+1)%3+3)%3].increment(seq,pos,false);
118                                            break;
119                                    case(10):
120                                    case(11):
121                                    case(12):
122                                    case(23):
123                                    case(24):
124                                    case(25):
125                                    case(26):
126                                    case(27):
127                                    case(28):                                       
128                                            intron.increment(seq,pos,false);
129                                            break;
130                                    default:
131                                            Assert.a(false);
132                                    }       
133                            }
134                    }
135                    
136                    for (int j=0; j<3; j++) {
137                            exonic[j].finalize();
138                    }
139                    intron.finalize();
140                    intergenic.finalize();
141            }
142    
143            
144            public void evaluateNode(InputSequence<? extends Character> seq, int pos, int state, FeatureList result) {
145                    double evaluation=0;
146    
147                    int indexOffset = Integer.MIN_VALUE;
148                    int phase;
149                    switch(state) {
150                    case(0):
151                    case(13):
152                    case(14):       
153                    case(21):
154                    case(22):                       
155                            evaluation = intergenic.lookup(seq,pos,true);
156                            indexOffset = 0;
157                            break;
158                    case(1):
159                    case(2):
160                    case(3):
161                            phase = ((pos-state+1)%3+3)%3;
162                            evaluation = exonic[phase].lookup(seq,pos,true);
163                            indexOffset = 1;// + phase;
164                            break;
165                    case(4):
166                    case(5):
167                    case(6):
168                    case(15):
169                    case(16):
170                    case(17):
171                    case(18):
172                    case(19):
173                    case(20):                       
174                            evaluation = intron.lookup(seq,pos,true);
175                            indexOffset = 2;
176                            break;
177                    case(7):
178                    case(8):
179                    case(9):
180                            phase = ((-pos+state+1)%3+3)%3;
181                            evaluation = exonic[phase].lookup(seq,pos,false);
182                            indexOffset = 3;// + phase;
183                            break;
184                    case(10):
185                    case(11):
186                    case(12):
187                    case(23):
188                    case(24):
189                    case(25):
190                    case(26):
191                    case(27):
192                    case(28):                       
193                            evaluation = intron.lookup(seq,pos,false);
194                            indexOffset = 4;
195                            break;          
196                    default:
197                            Assert.a(false);
198                    }
199                    
200                    result.addFeature(startIx + (multipleFeatures ? indexOffset : 0), evaluation);          
201            }
202    
203            /** if true, a separate feature index is used for each state, creating 21 weights instead of 1.
204             * @return returns true if a separate feature index is used for each state
205             */
206            public boolean isMultipleFeatures() {
207                    return multipleFeatures;
208            }
209    
210            /**
211             * @param multipleFeatures The multipleFeatures to set.
212             */
213            public void setMultipleFeatures(boolean weightPerState) {
214                    this.multipleFeatures = weightPerState;
215            }
216    
217            
218    }