001    package calhoun.analysis.crf.features.interval13;
002    
003    import java.util.List;
004    
005    import org.apache.commons.logging.Log;
006    import org.apache.commons.logging.LogFactory;
007    
008    import calhoun.analysis.crf.AbstractFeatureManager;
009    import calhoun.analysis.crf.CacheStrategySpec;
010    import calhoun.analysis.crf.FeatureList;
011    import calhoun.analysis.crf.FeatureManagerEdge;
012    import calhoun.analysis.crf.ModelManager;
013    import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014    import calhoun.analysis.crf.io.InputSequence;
015    import calhoun.analysis.crf.io.TrainingSequence;
016    import calhoun.util.Assert;
017    
018    
019    public class StateTransitionsInterval13 extends AbstractFeatureManager<Character> implements FeatureManagerEdge<Character> {
020            private static final long serialVersionUID = -7745853849576425025L;
021            private static final Log log = LogFactory.getLog(StateTransitionsInterval13.class);
022    
023            private int startIx;
024            private double intronProb;
025            private double endProb;
026            
027            public String getFeatureName(int featureIndex) {
028                    return "State transitions";
029            }
030    
031            public int getNumFeatures() {
032                    return 1;
033            }
034    
035            public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
036                    Interval13Tools.verify(modelInfo);
037                    startIx = startingIndex;
038                    
039                    // Get the average # of exons per gene from training data
040                    int intronCount = 0;
041                    int geneCount = 0;
042                    for(TrainingSequence<?> seq : data) {
043                            int[] y = seq.getY();
044                            int prevState = y[0];
045                            for(int i=1; i<y.length; ++i) {
046                                    int state = y[i];
047                                    switch(Interval13Tools.edgeConstraints[prevState*Interval13Tools.numStates + state]) {
048                                            case PDON:
049                                            case MACC:
050                                                    ++intronCount;
051                                                    break;
052                                            case PSTOP:
053                                            case MSTART:
054                                                    ++geneCount;
055                                                    break;
056                                            default:
057                                    }
058                                    prevState = state;
059                            }
060                    }
061                    
062                    double avgExonCount = (intronCount+geneCount)/((float)geneCount);
063                    endProb = Math.log(1/avgExonCount);
064                    intronProb = Math.log(1 - 1/avgExonCount);
065                    log.warn(String.format("%d genes, %d introns, %.2f exons/gene", geneCount, intronCount, avgExonCount));
066            }
067            
068            public void evaluateEdge(InputSequence<? extends Character> seq, int pos, int prevState, int state, FeatureList result) {
069            
070                    // There's really only one parameter below: the average number of exons in a gene.
071                    
072                    switch(Interval13Tools.edgeConstraints[prevState*Interval13Tools.numStates + state]) {
073                            case NONE:
074                            case PACC:
075                            case MDON:
076                            case PCODE: // redundant with node invalidation below
077                            case MCODE: // redundant iwth node evaluation below
078                                    break;
079                            case PSTART:
080                            case MSTOP:
081                                    result.addFeature(startIx,Math.log(0.5));
082                                    break;
083                            case PDON:
084                            case MACC:
085                                    result.addFeature(startIx, intronProb);
086                                    break;
087                            case PSTOP:
088                            case MSTART:
089                                    result.addFeature(startIx, endProb);
090                                    break;
091                            case NEVER:
092                            default:
093                                    Assert.a(false);
094                    }
095            }
096    
097            
098            @Override
099            public CacheStrategySpec getCacheStrategy() {
100                    return new CacheStrategySpec(CacheStrategy.CONSTANT);
101            }
102    
103            /**
104             * @return Returns the endProb.
105             */
106            public double getEndProb() {
107                    return endProb;
108            }
109    
110            /**
111             * @return Returns the intronProb.
112             */
113            public double getIntronProb() {
114                    return intronProb;
115            }
116    
117    }