001 package calhoun.analysis.crf.solver;
002
003 import java.util.Arrays;
004 import java.util.List;
005
006 import org.apache.commons.logging.Log;
007 import org.apache.commons.logging.LogFactory;
008
009 import riso.numerical.LBFGS;
010 import calhoun.analysis.crf.CRFObjectiveFunctionGradient;
011 import calhoun.analysis.crf.CRFTraining;
012 import calhoun.analysis.crf.ModelManager;
013 import calhoun.analysis.crf.io.TrainingSequence;
014 import calhoun.util.Assert;
015 import calhoun.util.ErrorException;
016
017 /** uses a L-BFGS algorithm to optimize the objective function. This is the algorithm normally used.
 * Several configuration parameters allow you to control the optimization process:
019 *
020 * <p>
021 * This optimizer requires one property to be set:
022 * <p>
023 * <b><code>gradFunc</code></b> - This is the gradient function that the optimizer will use. It must be a reference to a bean that implements the {@link CRFObjectiveFunctionGradient} interface.
024 * <p>
025 * In addition, it has several optional properties that allow control over the optimization process:
026 * <ul>
027 * <li> <b><code>debugLevel</code></b> - explicitly sets the debug level in the underlying L-BFGS solver. Normally it is set based
028 * on the debug level for this object.
 * <li> <b><code>epsForConvergence</code></b> - epsilon value used to determine when to halt the optimization and declare convergence. Defaults
 * to 0.00001.
031 * <li> <b><code>fixFirstWeight</code></b> - if true, the first feature weight will be fixed at 1.0 and will not be allowed to change
032 * during the optimization
033 * <li> <b><code>maxIters</code></b> - The maximum number of iterations (objective function evaluations) to attempt
 * <li> <b><code>mForHessian</code></b> - sets the <code>mForHessian</code> parameter in the underlying L-BFGS solver. Defaults to 20.
 * <li> <b><code>quadraticRegularization</code></b> - if set to a nonzero value, regularizes feature weights by imposing
 * a quadratic penalty on the objective function based on the squared magnitudes of the weights. Can help in cases where weights
 * get very large.
 * <li> <b><code>requireConvergence</code></b> - if true, throws an error if convergence is not reached in the maximum number
 * of iterations. Otherwise, the current feature weights are returned when <code>maxIters</code> is reached.
040 * <li> <b><code>starts</code></b> - an initial set of guesses at feature weights. Defaults to 1.0
041 * <li> <b><code>unchangedObjective</code></b> - the number of times the objectiveFunction must return an identical value before the optimization is assumed to have converged.
042 * This protects against a situation where the epsForConvergence is set low enough that even the smallest possible changes in weights can't find an optimum. Defaults to 5
043 * </ul>
044 * */
045 public class StandardOptimizer implements CRFTraining {
046 private static final Log log = LogFactory.getLog(StandardOptimizer.class);
047 boolean debug = log.isDebugEnabled();
048
049 // Configuration
050 CRFObjectiveFunctionGradient gradFunc;
051 int maxIters = 2000;
052 int mForHessian = 20;
053 int debugLevel = 0;
054 boolean requireConvergence = true;
055 double epsForConvergence = 0.00001;
056 double[] starts = null;
057 boolean fixFirstWeight = false;
058 double quadraticRegularization = 0.0;
059 int unchangedObjective = 5;
060
061 public double[] optimize(ModelManager fm, List<? extends TrainingSequence<?>> data) {
062 gradFunc.setTrainingData(fm, data);
063 int nFeatures = fm.getNumFeatures();
064 if (requireConvergence) {
065 log.info("NOTE: You ARE requiring convergence of LBFGS");
066 } else {
067 log.warn("NOTE: You ARE NOT requiring convergence of LBFGS");
068 }
069
070 double f = Float.NaN, xtol = 1.0e-16; // machine precision
071 int iprint[] = new int[2];
072 iprint[0] = debugLevel - 2;
073 iprint[1] = debugLevel - 1;
074 int[] iflag = new int[] {0};
075 int numFeatures = nFeatures;
076
077 double[] lambda = new double[nFeatures];
078 double[] grad = new double[nFeatures];
079
080 if(fixFirstWeight) {
081 log.info("Fixing first weight to 1.0.");
082 numFeatures -= 1;
083 }
084
085 double[] optLambda = new double[numFeatures];
086 double[] optGrad = new double[numFeatures];
087 double[] diag = new double[numFeatures]; // needed by the optimizer
088
089 if(starts == null) {
090 Arrays.fill(lambda, 1.0);
091 }
092 else {
093 Assert.a(starts.length == lambda.length, "Received ", starts.length, " initial weights. Expected: ", lambda.length);
094 System.arraycopy(starts, 0, lambda, 0, lambda.length);
095 }
096
097 int icall = 0;
098 float lastObjective = Float.NaN;
099 int runningObjective = 0;
100 do {
101 if(fixFirstWeight) {
102 lambda[0] = 1.0;
103 }
104
105 try {
106 f = gradFunc.apply(lambda, grad);
107 }
108 catch(RuntimeException ex) {
109 if(requireConvergence) {
110 throw ex;
111 } else {
112 log.warn("Exception thrown while calculating gradient. Possible numeric problem.");
113 log.warn(ex);
114 return lambda;
115 }
116 }
117
118 // Take the gradient, normalize by the total length of the sequence, and make it a minimization instead of a maximization problem.
119 //f = -f;
120 f = -f;
121 if(fixFirstWeight) {
122 for (int j = 1; j < lambda.length; j++) {
123 optGrad[j-1] = -grad[j];
124 //optGrad[j-1] = -grad[j];
125 }
126 for (int j = 1; j < lambda.length; j++) {
127 optLambda[j-1] = lambda[j];
128 }
129 }
130 else {
131 for (int j = 0; j < lambda.length; j++) {
132 optGrad[j] = -grad[j];
133 //optGrad[j] = -grad[j];
134 }
135 for (int j = 0; j < lambda.length; j++) {
136 optLambda[j] = lambda[j];
137 }
138 }
139 // Add a regularization term quadraticRegularization*sum(lambda_i^2)
140 for (int j=0; j<lambda.length; j++) {
141 f += quadraticRegularization*lambda[j]*lambda[j];
142 optGrad[j] += 2*quadraticRegularization*lambda[j];
143 }
144
145 // Check for an objective value that hasn't changed over several iterations.
146 if(((float) f) == lastObjective) {
147 ++runningObjective;
148 log.info("Objective value unchanged: "+lastObjective+" returned "+runningObjective+" times.");
149 if(runningObjective > 0 && runningObjective >= unchangedObjective) {
150 log.warn("Same objective value: "+lastObjective+" returned "+(unchangedObjective+1)+" times. Assuming convergence.");
151 return lambda;
152 }
153 }
154 else {
155 runningObjective = 0;
156 lastObjective = (float) f;
157 }
158
159 try {
160 LBFGS.gtol = 0.1;
161 LBFGS.lbfgs(numFeatures, mForHessian, optLambda, f, optGrad, false, diag, iprint, epsForConvergence, xtol, iflag);
162 } catch (LBFGS.ExceptionWithIflag e) {
163 if(requireConvergence)
164 throw new ErrorException("lbfgs failed", e);
165 else {
166 log.warn("Lbfgs failed, proceeding anyway", e);
167 break;
168 }
169 }
170 if(fixFirstWeight) {
171 for (int j = 1; j < lambda.length; j++) {
172 lambda[j] = optLambda[j-1];
173 }
174 }
175 else {
176 for (int j = 0; j < lambda.length; j++) {
177 lambda[j] = optLambda[j];
178 }
179 }
180 icall += 1;
181 } while ((iflag[0] != 0) && (icall < maxIters));
182 if(requireConvergence && !(iflag[0] == 0)) {
183 throw new ErrorException("Convergence not reached.");
184 }
185 return lambda;
186 }
187
188 /** returns the configured objective function gradient which will be
189 * used by the optimizer during the training process.
190 * @return the configured objective function gradient
191 */
192 public CRFObjectiveFunctionGradient getObjectiveFunction() {
193 return gradFunc;
194 }
195
196 /** sets the objective function gradient. Called automatically during configuration. */
197 public void setObjectiveFunction(CRFObjectiveFunctionGradient objectiveFunction) {
198 this.gradFunc = objectiveFunction;
199 }
200
201 public int getDebugLevel() {
202 return debugLevel;
203 }
204
205 public void setDebugLevel(int debugLevel) {
206 this.debugLevel = debugLevel;
207 }
208
209 public double getEpsForConvergence() {
210 return epsForConvergence;
211 }
212
213 public void setEpsForConvergence(double epsForConvergence) {
214 this.epsForConvergence = epsForConvergence;
215 }
216
217 public boolean isFixFirstWeight() {
218 return fixFirstWeight;
219 }
220
221 public void setFixFirstWeight(boolean fixFirstWeight) {
222 this.fixFirstWeight = fixFirstWeight;
223 }
224
225 public int getMaxIters() {
226 return maxIters;
227 }
228
229 public void setMaxIters(int maxIters) {
230 this.maxIters = maxIters;
231 }
232
233 public int getMForHessian() {
234 return mForHessian;
235 }
236
237 public void setMForHessian(int forHessian) {
238 mForHessian = forHessian;
239 }
240
241 public double getQuadraticRegularization() {
242 return quadraticRegularization;
243 }
244
245 public void setQuadraticRegularization(double quadraticRegularization) {
246 this.quadraticRegularization = quadraticRegularization;
247 }
248
249 public boolean isRequireConvergence() {
250 return requireConvergence;
251 }
252
253 public void setRequireConvergence(boolean requireConvergence) {
254 this.requireConvergence = requireConvergence;
255 }
256
257 public double[] getStarts() {
258 return starts;
259 }
260
261 public void setStarts(double[] starts) {
262 this.starts = starts;
263 }
264
265 /**
266 * @return Returns the unchangedObjective.
267 */
268 public int getUnchangedObjective() {
269 return unchangedObjective;
270 }
271
272 /**
273 * @param unchangedObjective The unchangedObjective to set.
274 */
275 public void setUnchangedObjective(int unchangedObjective) {
276 this.unchangedObjective = unchangedObjective;
277 }
278 }