001 package calhoun.analysis.crf.features.interval29;
002
003 import java.util.List;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.AbstractFeatureManager;
009 import calhoun.analysis.crf.CacheStrategySpec;
010 import calhoun.analysis.crf.FeatureList;
011 import calhoun.analysis.crf.FeatureManagerEdge;
012 import calhoun.analysis.crf.ModelManager;
013 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014 import calhoun.analysis.crf.io.InputSequence;
015 import calhoun.analysis.crf.io.TrainingSequence;
016 import calhoun.util.Assert;
017
018
019 public class StateTransitionsInterval29 extends AbstractFeatureManager<Character> implements FeatureManagerEdge<Character> {
020 private static final long serialVersionUID = -7745853849576425025L;
021 private static final Log log = LogFactory.getLog(StateTransitionsInterval29.class);
022
023 private int startIx;
024 private double intronProb;
025 private double endProb;
026
027 public String getFeatureName(int featureIndex) {
028 return "State transition log-probabilities for the model Interval29";
029 }
030
031 public int getNumFeatures() {
032 return 1;
033 }
034
035 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
036 Interval29Tools.verify(modelInfo);
037 startIx = startingIndex;
038
039 // Get the average # of exons per gene from training data
040 int intronCount = 0;
041 int geneCount = 0;
042 for(TrainingSequence<?> seq : data) {
043 int[] y = seq.getY();
044 int prevState = y[0];
045 for(int i=1; i<y.length; ++i) {
046 int state = y[i];
047 switch(Interval29Tools.edgeConstraints[prevState*Interval29Tools.numStates + state]) {
048 case PDON:
049 case MACC:
050 ++intronCount;
051 break;
052 case PSTOP:
053 case MSTART:
054 ++geneCount;
055 break;
056 default:
057 }
058 prevState = state;
059 }
060 }
061
062 double avgExonCount = (intronCount+geneCount)/geneCount;
063 endProb = Math.log(1/avgExonCount);
064 intronProb = Math.log(1 - 1/avgExonCount);
065 log.info(String.format("%d genes, %d introns, %.2f exons/gene", geneCount, intronCount, avgExonCount));
066 }
067
068 public void evaluateEdge(InputSequence<? extends Character> seq, int pos, int prevState, int state, FeatureList result) {
069
070 // There's really only one parameter below: the average number of exons in a gene.
071
072 switch(Interval29Tools.edgeConstraints[prevState*Interval29Tools.numStates + state]) {
073 case NONE:
074 case PACC:
075 case MDON:
076 case PCODE: // redundant with node invalidation below
077 case MCODE: // redundant iwth node evaluation below
078 break;
079 case PSTART:
080 case MSTOP:
081 result.addFeature(startIx,Math.log(0.5));
082 break;
083 case PDON:
084 case MACC:
085 result.addFeature(startIx, intronProb);
086 break;
087 case PSTOP:
088 case MSTART:
089 result.addFeature(startIx, endProb);
090 break;
091 case NEVER:
092 case PKEEPE:
093 case PKEEPI:
094 case MKEEPE:
095 case MKEEPI:
096 case PSTOPPED:
097 case MSTARTED:
098 case PWILLSTART:
099 case MWILLSTOP:
100 break;
101 default:
102 Assert.a(false);
103 }
104 }
105
106
107 public CacheStrategySpec getCacheStrategy() {
108 return new CacheStrategySpec(CacheStrategy.CONSTANT);
109 }
110
111 /**
112 * @return Returns the endProb.
113 */
114 public double getEndProb() {
115 return endProb;
116 }
117
118 /**
119 * @return Returns the intronProb.
120 */
121 public double getIntronProb() {
122 return intronProb;
123 }
124
125 }