001 package calhoun.analysis.crf.features.interval13;
002
003 import java.util.List;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.AbstractFeatureManager;
009 import calhoun.analysis.crf.CacheStrategySpec;
010 import calhoun.analysis.crf.FeatureList;
011 import calhoun.analysis.crf.FeatureManagerEdge;
012 import calhoun.analysis.crf.ModelManager;
013 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014 import calhoun.analysis.crf.io.InputSequence;
015 import calhoun.analysis.crf.io.TrainingSequence;
016 import calhoun.util.Assert;
017
018
019 public class StateTransitionsInterval13 extends AbstractFeatureManager<Character> implements FeatureManagerEdge<Character> {
020 private static final long serialVersionUID = -7745853849576425025L;
021 private static final Log log = LogFactory.getLog(StateTransitionsInterval13.class);
022
023 private int startIx;
024 private double intronProb;
025 private double endProb;
026
027 public String getFeatureName(int featureIndex) {
028 return "State transitions";
029 }
030
031 public int getNumFeatures() {
032 return 1;
033 }
034
035 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
036 Interval13Tools.verify(modelInfo);
037 startIx = startingIndex;
038
039 // Get the average # of exons per gene from training data
040 int intronCount = 0;
041 int geneCount = 0;
042 for(TrainingSequence<?> seq : data) {
043 int[] y = seq.getY();
044 int prevState = y[0];
045 for(int i=1; i<y.length; ++i) {
046 int state = y[i];
047 switch(Interval13Tools.edgeConstraints[prevState*Interval13Tools.numStates + state]) {
048 case PDON:
049 case MACC:
050 ++intronCount;
051 break;
052 case PSTOP:
053 case MSTART:
054 ++geneCount;
055 break;
056 default:
057 }
058 prevState = state;
059 }
060 }
061
062 double avgExonCount = (intronCount+geneCount)/((float)geneCount);
063 endProb = Math.log(1/avgExonCount);
064 intronProb = Math.log(1 - 1/avgExonCount);
065 log.warn(String.format("%d genes, %d introns, %.2f exons/gene", geneCount, intronCount, avgExonCount));
066 }
067
068 public void evaluateEdge(InputSequence<? extends Character> seq, int pos, int prevState, int state, FeatureList result) {
069
070 // There's really only one parameter below: the average number of exons in a gene.
071
072 switch(Interval13Tools.edgeConstraints[prevState*Interval13Tools.numStates + state]) {
073 case NONE:
074 case PACC:
075 case MDON:
076 case PCODE: // redundant with node invalidation below
077 case MCODE: // redundant iwth node evaluation below
078 break;
079 case PSTART:
080 case MSTOP:
081 result.addFeature(startIx,Math.log(0.5));
082 break;
083 case PDON:
084 case MACC:
085 result.addFeature(startIx, intronProb);
086 break;
087 case PSTOP:
088 case MSTART:
089 result.addFeature(startIx, endProb);
090 break;
091 case NEVER:
092 default:
093 Assert.a(false);
094 }
095 }
096
097
098 @Override
099 public CacheStrategySpec getCacheStrategy() {
100 return new CacheStrategySpec(CacheStrategy.CONSTANT);
101 }
102
103 /**
104 * @return Returns the endProb.
105 */
106 public double getEndProb() {
107 return endProb;
108 }
109
110 /**
111 * @return Returns the intronProb.
112 */
113 public double getIntronProb() {
114 return intronProb;
115 }
116
117 }