001    package calhoun.analysis.crf.features.interval29;
002    
003    import java.util.List;
004    
005    import org.apache.commons.logging.Log;
006    import org.apache.commons.logging.LogFactory;
007    
008    import calhoun.analysis.crf.AbstractFeatureManager;
009    import calhoun.analysis.crf.CacheStrategySpec;
010    import calhoun.analysis.crf.FeatureList;
011    import calhoun.analysis.crf.FeatureManagerEdge;
012    import calhoun.analysis.crf.ModelManager;
013    import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014    import calhoun.analysis.crf.io.InputSequence;
015    import calhoun.analysis.crf.io.TrainingSequence;
016    import calhoun.util.Assert;
017    
018    
019    public class StateTransitionsInterval29 extends AbstractFeatureManager<Character> implements FeatureManagerEdge<Character> {
020            private static final long serialVersionUID = -7745853849576425025L;
021            private static final Log log = LogFactory.getLog(StateTransitionsInterval29.class);
022    
023            private int startIx;
024            private double intronProb;
025            private double endProb;
026            
027            public String getFeatureName(int featureIndex) {
028                    return "State transition log-probabilities for the model Interval29";
029            }
030    
031            public int getNumFeatures() {
032                    return 1;
033            }
034    
035            public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
036                    Interval29Tools.verify(modelInfo);
037                    startIx = startingIndex;
038                    
039                    // Get the average # of exons per gene from training data
040                    int intronCount = 0;
041                    int geneCount = 0;
042                    for(TrainingSequence<?> seq : data) {
043                            int[] y = seq.getY();
044                            int prevState = y[0];
045                            for(int i=1; i<y.length; ++i) {
046                                    int state = y[i];
047                                    switch(Interval29Tools.edgeConstraints[prevState*Interval29Tools.numStates + state]) {
048                                            case PDON:
049                                            case MACC:
050                                                    ++intronCount;
051                                                    break;
052                                            case PSTOP:
053                                            case MSTART:
054                                                    ++geneCount;
055                                                    break;
056                                            default:
057                                    }
058                                    prevState = state;
059                            }
060                    }
061                    
062                    double avgExonCount = (intronCount+geneCount)/geneCount;
063                    endProb = Math.log(1/avgExonCount);
064                    intronProb = Math.log(1 - 1/avgExonCount);
065                    log.info(String.format("%d genes, %d introns, %.2f exons/gene", geneCount, intronCount, avgExonCount));
066            }
067            
068            public void evaluateEdge(InputSequence<? extends Character> seq, int pos, int prevState, int state, FeatureList result) {
069            
070                    // There's really only one parameter below: the average number of exons in a gene.
071                    
072                    switch(Interval29Tools.edgeConstraints[prevState*Interval29Tools.numStates + state]) {
073                            case NONE:
074                            case PACC:
075                            case MDON:
076                            case PCODE: // redundant with node invalidation below
077                            case MCODE: // redundant iwth node evaluation below
078                                    break;
079                            case PSTART:
080                            case MSTOP:
081                                    result.addFeature(startIx,Math.log(0.5));
082                                    break;
083                            case PDON:
084                            case MACC:
085                                    result.addFeature(startIx, intronProb);
086                                    break;
087                            case PSTOP:
088                            case MSTART:
089                                    result.addFeature(startIx, endProb);
090                                    break;
091                            case NEVER:
092                            case PKEEPE:
093                            case PKEEPI:
094                            case MKEEPE:
095                            case MKEEPI:
096                            case PSTOPPED:
097                            case MSTARTED:
098                            case PWILLSTART:
099                            case MWILLSTOP:                         
100                                    break;                          
101                            default:
102                                    Assert.a(false);
103                    }
104            }
105    
106            
107            public CacheStrategySpec getCacheStrategy() {
108                    return new CacheStrategySpec(CacheStrategy.CONSTANT);
109            }
110    
111            /**
112             * @return Returns the endProb.
113             */
114            public double getEndProb() {
115                    return endProb;
116            }
117    
118            /**
119             * @return Returns the intronProb.
120             */
121            public double getIntronProb() {
122                    return intronProb;
123            }
124    
125    }