001 package calhoun.analysis.crf.features.interval13;
002
003 import java.util.ArrayList;
004 import java.util.List;
005
006 import org.apache.commons.logging.Log;
007 import org.apache.commons.logging.LogFactory;
008
009 import calhoun.analysis.crf.CacheStrategySpec;
010 import calhoun.analysis.crf.FeatureList;
011 import calhoun.analysis.crf.FeatureManagerNodeExplicitLength;
012 import calhoun.analysis.crf.ModelManager;
013 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014 import calhoun.analysis.crf.io.InputSequence;
015 import calhoun.analysis.crf.io.SequenceConverter;
016 import calhoun.analysis.crf.io.TrainingSequence;
017 import calhoun.analysis.crf.statistics.MixtureOfGammas;
018 import calhoun.util.Assert;
019
020 public class StateLengthLogprobInterval13 implements FeatureManagerNodeExplicitLength<Character> {
021 private static final long serialVersionUID = 8685543199212865835L;
022
023 /* Returns the log probability of a state having the specified duration
024 * returned value depends only on state and length
025 *
026 * trained using mixture of gammas model, with a few tweaks:
027 * a) if all values identical (eg always have exactly 200 intergenic bases in training examples), then model as exponential length
028 * b) Otherwise introduce priors at 90 and 110% of median and at 95% and 105% of median, and train mixture of gammas using EM.
029 * This prevents convergence to a case where one of the two components is a single data point and likelihood is infinite.
030 *
031 * Some notes on normalization: (see also mixGamma):
032 * If you have a given state and evaluate these probabilities (before taking logs) and add up over all possible lengths,
033 * the result should be approximately one, but not exactly. Two reasons for discrepancy:
034 * a) mixGamma, integrated from to to infinity, should add up to 1.0. But summing at 1,2,3,4,... is a discrete
035 * approximation and might not agree exactly.
036 * b) The summation isn't from 1,2,3,4,... but from minLength[state] to maxLength[state]. However, this feature does not
037 * have access to that min/max length information, so normalization for this reason must happen downstream.
038 */
039 private static final Log log = LogFactory.getLog(StateLengthLogprobInterval13.class);
040
041 private int startIx;
042 ModelManager mi;
043 private String inputComponentName;
044
045 MixtureOfGammas intergenicMixGamma;
046 MixtureOfGammas exonMixGamma;
047 MixtureOfGammas intronMixGamma;
048
049 private boolean forceExponential = false;
050 private boolean exonExponential = false;
051 private boolean intronExponential = false;
052 private boolean multipleFeatures = false;
053 private boolean noIntergenic = false;
054
055 public void setForceExponential(boolean forceExponential) {
056 this.forceExponential = forceExponential;
057 }
058
059 public void setExonExponential(boolean exonExponential) {
060 this.exonExponential = exonExponential;
061 }
062
063 public void setIntronExponential(boolean intronExponential) {
064 this.intronExponential = intronExponential;
065 }
066
067 public void evaluateNodeLength(InputSequence<? extends Character> seq, int pos, int length, int state, FeatureList result) {
068 Assert.a(length>0);
069
070 MixtureOfGammas mg = null;
071 int indexOffset = Integer.MIN_VALUE;
072 switch (state) {
073 case (0):
074 if(noIntergenic)
075 return;
076 indexOffset = 0;
077 mg = intergenicMixGamma;
078 break;
079 case(1):
080 case(2):
081 case(3):
082 case(7):
083 case(8):
084 case(9):
085 indexOffset = 1;
086 mg = exonMixGamma;
087 break;
088 case(4):
089 case(5):
090 case(6):
091 case(10):
092 case(11):
093 case(12):
094 indexOffset = 2;
095 mg = intronMixGamma;
096 break;
097 default:
098 Assert.a(false);
099 }
100
101 double val = mg.logEvaluate((double) length);
102 Assert.a((val != Double.NEGATIVE_INFINITY) && (val != Double.POSITIVE_INFINITY) && (!Double.isNaN(val)));
103 if (val>0) {
104 log.warn("About to return a state length logprob evaluation that is greater than zero, see notes in source code.");
105 }
106 result.addFeature(startIx + (multipleFeatures ? indexOffset : 0),val);
107 }
108
109 public CacheStrategySpec getCacheStrategy() {
110 return new CacheStrategySpec(CacheStrategy.LENGTHFUNCTION);
111 }
112
113 public String getFeatureName(int featureIndex) {
114 if(multipleFeatures) {
115 String[] vals = new String[] { "Intergenic", "Exon", "Intron"};
116 String type = vals[featureIndex - startIx];
117 return type+" lengths";
118 }
119 else {
120 return "StateDurationLogProbForModelInterval13";
121 }
122 }
123
124 public String getInputComponent() {
125 return inputComponentName;
126 }
127
128 public void setInputComponent(String name) {
129 inputComponentName = name;
130 }
131
132 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
133 startIx = startingIndex;
134 mi = modelInfo;
135 Assert.a(mi.getNumStates()==13);
136
137 ArrayList<ArrayList<Integer>> stateDurations;
138
139 stateDurations = SequenceConverter.stateVector2StateLengths(data,mi.getNumStates());
140
141 List<Integer> exonLengths = new ArrayList<Integer>();
142 List<Integer> intronLengths = new ArrayList<Integer>();
143 List<Integer> intergenicLengths = new ArrayList<Integer>();
144
145 intergenicLengths.addAll(stateDurations.get(0));
146
147 exonLengths.addAll(stateDurations.get(1));
148 exonLengths.addAll(stateDurations.get(2));
149 exonLengths.addAll(stateDurations.get(3));
150 exonLengths.addAll(stateDurations.get(7));
151 exonLengths.addAll(stateDurations.get(8));
152 exonLengths.addAll(stateDurations.get(9));
153
154 intronLengths.addAll(stateDurations.get(4));
155 intronLengths.addAll(stateDurations.get(5));
156 intronLengths.addAll(stateDurations.get(6));
157 intronLengths.addAll(stateDurations.get(10));
158 intronLengths.addAll(stateDurations.get(11));
159 intronLengths.addAll(stateDurations.get(12));
160
161 double[] inter = new double[intergenicLengths.size()];
162 for (int j=0; j<intergenicLengths.size(); j++) {
163 inter[j] = (double) intergenicLengths.get(j);
164 }
165
166 double[] exon = new double[exonLengths.size()];
167 for (int j=0; j<exonLengths.size(); j++) {
168 exon[j] = (double) exonLengths.get(j);
169 }
170
171 double[] intron = new double[intronLengths.size()];
172 for (int j=0; j<intronLengths.size(); j++) {
173 intron[j] = (double) intronLengths.get(j);
174 }
175
176 if (forceExponential) {
177 intergenicMixGamma = new MixtureOfGammas(inter,true);
178 exonMixGamma = new MixtureOfGammas(exon,true);
179 intronMixGamma = new MixtureOfGammas(intron,true);
180 } else if (exonExponential) {
181 intergenicMixGamma = new MixtureOfGammas(inter,true);
182 exonMixGamma = new MixtureOfGammas(exon,true);
183 intronMixGamma = new MixtureOfGammas(intron);
184 } else if (intronExponential) {
185 intergenicMixGamma = new MixtureOfGammas(inter,true);
186 exonMixGamma = new MixtureOfGammas(exon);
187 intronMixGamma = new MixtureOfGammas(intron,true);
188 } else {
189 // by default, only intergenic regions modeled with exp length distributions
190 intergenicMixGamma = new MixtureOfGammas(inter,true);
191 exonMixGamma = new MixtureOfGammas(exon);
192 intronMixGamma = new MixtureOfGammas(intron);
193 }
194 }
195
196 public int getNumFeatures() {
197 return multipleFeatures ? 3 : 1;
198 }
199
200 /**
201 * @return Returns the multipleFeatures.
202 */
203 public boolean isMultipleFeatures() {
204 return multipleFeatures;
205 }
206
207 /** set to true to indicate that intergenic, exonic, and intergenic lengths should each get a separate weight.
208 * @param multipleFeatures The multipleFeatures to set.
209 */
210 public void setMultipleFeatures(boolean multipleFeatures) {
211 this.multipleFeatures = multipleFeatures;
212 }
213
214 /**
215 * @return Returns the noIntergenic.
216 */
217 public boolean isNoIntergenic() {
218 return noIntergenic;
219 }
220
221 /**
222 * @param noIntergenic The noIntergenic to set.
223 */
224 public void setNoIntergenic(boolean noIntergenic) {
225 this.noIntergenic = noIntergenic;
226 }
227 }