001    package calhoun.analysis.crf.solver;
002    
003    import java.util.Arrays;
004    import java.util.List;
005    
006    import org.apache.commons.logging.Log;
007    import org.apache.commons.logging.LogFactory;
008    
009    import riso.numerical.LBFGS;
010    import calhoun.analysis.crf.CRFObjectiveFunctionGradient;
011    import calhoun.analysis.crf.CRFTraining;
012    import calhoun.analysis.crf.ModelManager;
013    import calhoun.analysis.crf.io.TrainingSequence;
014    import calhoun.util.Assert;
015    import calhoun.util.ErrorException;
016    
017    /** uses a L-BFGS algorithm to optimize the objective function.  This is the algorithm normally used. 
 * Several configuration properties allow you to control the optimization process:
019     * 
020     * <p>
021     * This optimizer requires one property to be set:
022     * <p>
023     * <b><code>gradFunc</code></b> - This is the gradient function that the optimizer will use.  It must be a reference to a bean that implements the {@link CRFObjectiveFunctionGradient} interface. 
024     * <p>
025     * In addition, it has several optional properties that allow control over the optimization process:
026     * <ul>
027     * <li> <b><code>debugLevel</code></b> - explicitly sets the debug level in the underlying L-BFGS solver.  Normally it is set based
028     * on the debug level for this object. 
 * <li> <b><code>epsForConvergence</code></b> - epsilon value used to determine when to halt the optimization and declare convergence.  Defaults
 * to 0.00001.
031     * <li> <b><code>fixFirstWeight</code></b> - if true, the first feature weight will be fixed at 1.0 and will not be allowed to change 
032     * during the optimization
033     * <li> <b><code>maxIters</code></b> - The maximum number of iterations (objective function evaluations) to attempt
 * <li> <b><code>mForHessian</code></b> - sets the <code>mForHessian</code> parameter in the underlying L-BFGS solver.  Defaults to 20.
 * <li> <b><code>quadraticRegularization</code></b> - if set to a nonzero value, regularizes feature weights by imposing 
 * a penalty on the objective function based on the squared magnitudes of the weights.  Can help in cases where weights
 * get very large.
 * <li> <b><code>requireConvergence</code></b> - if true, throws an error if convergence is not reached in the maximum number 
 * of iterations.  Otherwise, the current feature weights are returned when <code>maxIters</code> is reached.
040     * <li> <b><code>starts</code></b> - an initial set of guesses at feature weights.  Defaults to 1.0
041     * <li> <b><code>unchangedObjective</code></b> - the number of times the objectiveFunction must return an identical value before the optimization is assumed to have converged.  
042     *   This protects against a situation where the epsForConvergence is set low enough that even the smallest possible changes in weights can't find an optimum.  Defaults to 5
043     * </ul>
044     * */
045    public class StandardOptimizer implements CRFTraining {
046            private static final Log log = LogFactory.getLog(StandardOptimizer.class);
047            boolean debug = log.isDebugEnabled();
048            
049            // Configuration
050            CRFObjectiveFunctionGradient gradFunc;
051            int maxIters = 2000;
052            int mForHessian = 20;
053            int debugLevel = 0;
054            boolean requireConvergence = true;
055            double epsForConvergence = 0.00001;
056            double[] starts = null;
057            boolean fixFirstWeight = false;
058            double quadraticRegularization = 0.0;
059            int unchangedObjective = 5;
060            
061            public double[] optimize(ModelManager fm, List<? extends TrainingSequence<?>> data) {
062                    gradFunc.setTrainingData(fm, data);
063                    int nFeatures = fm.getNumFeatures();
064                    if (requireConvergence) {
065                            log.info("NOTE: You ARE requiring convergence of LBFGS");
066                    } else {
067                            log.warn("NOTE: You ARE NOT requiring convergence of LBFGS");                   
068                    }
069    
070                    double f = Float.NaN, xtol = 1.0e-16; // machine precision
071                    int iprint[] = new int[2];
072                    iprint[0] = debugLevel - 2;
073                    iprint[1] = debugLevel - 1;
074                    int[] iflag = new int[] {0};
075                    int numFeatures = nFeatures;
076    
077                    double[] lambda = new double[nFeatures];
078                    double[] grad = new double[nFeatures];
079    
080                    if(fixFirstWeight) {
081                            log.info("Fixing first weight to 1.0.");
082                            numFeatures -= 1;
083                    }
084                    
085                    double[] optLambda = new double[numFeatures];
086                    double[] optGrad = new double[numFeatures];
087                    double[] diag = new double[numFeatures]; // needed by the optimizer
088    
089                    if(starts == null) {
090                            Arrays.fill(lambda, 1.0);
091                    }
092                    else {
093                            Assert.a(starts.length == lambda.length, "Received ", starts.length, " initial weights.  Expected: ", lambda.length);
094                            System.arraycopy(starts, 0, lambda, 0, lambda.length);
095                    }
096    
097                    int icall = 0;
098                    float lastObjective = Float.NaN;
099                    int runningObjective = 0;
100                    do {
101                            if(fixFirstWeight) {
102                                    lambda[0] = 1.0;
103                            }
104                            
105                            try {
106                                    f = gradFunc.apply(lambda, grad);
107                            }
108                            catch(RuntimeException ex) {
109                                    if(requireConvergence) {
110                                            throw ex;
111                                    } else {
112                                            log.warn("Exception thrown while calculating gradient.  Possible numeric problem.");
113                                            log.warn(ex);
114                                            return lambda;
115                                    }
116                            }
117    
118                            // Take the gradient, normalize by the total length of the sequence, and make it a minimization instead of a maximization problem.
119                            //f = -f; 
120                            f = -f; 
121                            if(fixFirstWeight) {
122                                    for (int j = 1; j < lambda.length; j++) {
123                                            optGrad[j-1] = -grad[j];
124                                            //optGrad[j-1] = -grad[j];
125                                    }
126                                    for (int j = 1; j < lambda.length; j++) {
127                                            optLambda[j-1] = lambda[j];
128                                    }
129                            }
130                            else {
131                                    for (int j = 0; j < lambda.length; j++) {
132                                            optGrad[j] = -grad[j];
133                                            //optGrad[j] = -grad[j];
134                                    }
135                                    for (int j = 0; j < lambda.length; j++) {
136                                            optLambda[j] = lambda[j];
137                                    }
138                            }
139                            // Add a regularization term quadraticRegularization*sum(lambda_i^2)
140                            for (int j=0; j<lambda.length; j++) {
141                                    f += quadraticRegularization*lambda[j]*lambda[j];
142                                    optGrad[j] += 2*quadraticRegularization*lambda[j];
143                            }
144                            
145                            // Check for an objective value that hasn't changed over several iterations.
146                            if(((float) f) == lastObjective) {
147                                    ++runningObjective;
148                                    log.info("Objective value unchanged: "+lastObjective+" returned "+runningObjective+" times.");
149                                    if(runningObjective > 0 && runningObjective >= unchangedObjective) {
150                                            log.warn("Same objective value: "+lastObjective+" returned "+(unchangedObjective+1)+" times.  Assuming convergence.");
151                                            return lambda;
152                                    }
153                            }
154                            else {
155                                    runningObjective = 0;
156                                    lastObjective = (float) f;
157                            }
158                            
159                            try {
160                                    LBFGS.gtol = 0.1;
161                                    LBFGS.lbfgs(numFeatures, mForHessian, optLambda, f, optGrad, false, diag, iprint, epsForConvergence, xtol, iflag);
162                            } catch (LBFGS.ExceptionWithIflag e) {
163                                    if(requireConvergence)
164                                            throw new ErrorException("lbfgs failed", e);
165                                    else {
166                                            log.warn("Lbfgs failed, proceeding anyway", e);
167                                            break;
168                                    }
169                            }
170                            if(fixFirstWeight) {
171                                    for (int j = 1; j < lambda.length; j++) {
172                                            lambda[j] = optLambda[j-1];
173                                    }
174                            }
175                            else {
176                                    for (int j = 0; j < lambda.length; j++) {
177                                            lambda[j] = optLambda[j];
178                                    }
179                            }
180                            icall += 1;
181                    } while ((iflag[0] != 0) && (icall < maxIters));
182                    if(requireConvergence && !(iflag[0] == 0)) {
183                            throw new ErrorException("Convergence not reached.");
184                    }
185                    return lambda;
186            }
187    
188            /** returns the configured objective function gradient which will be 
189             * used by the optimizer during the training process.
190             * @return the configured objective function gradient
191             */
192            public CRFObjectiveFunctionGradient getObjectiveFunction() {
193                    return gradFunc;
194            }
195            
196            /** sets the objective function gradient.  Called automatically during configuration. */
197            public void setObjectiveFunction(CRFObjectiveFunctionGradient objectiveFunction) {
198                    this.gradFunc = objectiveFunction;
199            }
200    
201            public int getDebugLevel() {
202                    return debugLevel;
203            }
204    
205            public void setDebugLevel(int debugLevel) {
206                    this.debugLevel = debugLevel;
207            }
208    
209            public double getEpsForConvergence() {
210                    return epsForConvergence;
211            }
212    
213            public void setEpsForConvergence(double epsForConvergence) {
214                    this.epsForConvergence = epsForConvergence;
215            }
216    
217            public boolean isFixFirstWeight() {
218                    return fixFirstWeight;
219            }
220    
221            public void setFixFirstWeight(boolean fixFirstWeight) {
222                    this.fixFirstWeight = fixFirstWeight;
223            }
224    
225            public int getMaxIters() {
226                    return maxIters;
227            }
228    
229            public void setMaxIters(int maxIters) {
230                    this.maxIters = maxIters;
231            }
232    
233            public int getMForHessian() {
234                    return mForHessian;
235            }
236    
237            public void setMForHessian(int forHessian) {
238                    mForHessian = forHessian;
239            }
240    
241            public double getQuadraticRegularization() {
242                    return quadraticRegularization;
243            }
244    
245            public void setQuadraticRegularization(double quadraticRegularization) {
246                    this.quadraticRegularization = quadraticRegularization;
247            }
248    
249            public boolean isRequireConvergence() {
250                    return requireConvergence;
251            }
252    
253            public void setRequireConvergence(boolean requireConvergence) {
254                    this.requireConvergence = requireConvergence;
255            }
256    
257            public double[] getStarts() {
258                    return starts;
259            }
260    
261            public void setStarts(double[] starts) {
262                    this.starts = starts;
263            }
264    
265            /**
266             * @return Returns the unchangedObjective.
267             */
268            public int getUnchangedObjective() {
269                    return unchangedObjective;
270            }
271    
272            /**
273             * @param unchangedObjective The unchangedObjective to set.
274             */
275            public void setUnchangedObjective(int unchangedObjective) {
276                    this.unchangedObjective = unchangedObjective;
277            }
278    }