Stan 2.5.0
probability, sampling & optimization
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
bfgs.hpp
Go to the documentation of this file.
1 #ifndef STAN__OPTIMIZATION__BFGS_HPP
2 #define STAN__OPTIMIZATION__BFGS_HPP
3 
4 #include <cmath>
5 #include <cstdlib>
6 #include <string>
7 #include <limits>
8 
9 #include <boost/math/special_functions/fpclassify.hpp>
10 
13 #include <stan/model/util.hpp>
14 
18 
19 namespace stan {
20  namespace optimization {
 21  typedef enum {
 // Termination codes returned by BFGSMinimizer::step() / minimize().
 // NOTE(review): this Doxygen dump is missing several original source
 // lines here — enumerators such as TERM_SUCCESS, TERM_ABSGRAD,
 // TERM_RELGRAD, TERM_LSFAIL and the closing typedef name are not
 // visible (they are referenced later by get_code_string()); confirm
 // against the original bfgs.hpp.
 23  TERM_ABSX = 10,
 // ^ absolute change in parameter vector fell below tolAbsX
 24  TERM_ABSF = 20,
 // ^ absolute change in objective value fell below tolAbsF
 25  TERM_RELF = 21,
 // ^ relative change in objective value fell below tolRelF * epsilon
 28  TERM_MAXIT = 40,
 // ^ iteration count reached maxIts without satisfying a convergence test
 31 
 32  template<typename Scalar = double>
 // Bag of convergence tolerances consulted by BFGSMinimizer::step().
 // NOTE(review): the class declaration line and the constructor's
 // opening line are missing from this Doxygen dump (orig. lines 33, 35);
 // the assignments below are the constructor body setting the defaults.
 34  public:
 36  maxIts = 10000;
 37  fScale = 1.0;
 38 
 39  tolAbsX = 1e-8;
 40  tolAbsF = 1e-12;
 41  tolAbsGrad = 1e-8;
 42 
 43  tolRelF = 1e+4;
 44  tolRelGrad = 1e+3;
 45  }
 46  size_t maxIts;
 // ^ maximum number of iterations before TERM_MAXIT
 47  Scalar tolAbsX;
 // ^ absolute tolerance on the step norm ||x_k - x_{k-1}||
 48  Scalar tolAbsF;
 // ^ absolute tolerance on |f_k - f_{k-1}|
 49  Scalar tolRelF;
 // ^ relative objective tolerance; multiplied by machine epsilon in step()
 50  Scalar fScale;
 // ^ floor used when normalizing relative quantities (see rel_grad_norm)
 51  Scalar tolAbsGrad;
 // ^ absolute tolerance on the gradient norm
 52  Scalar tolRelGrad;
 // ^ relative gradient tolerance; multiplied by machine epsilon in step()
 53  };
54 
 55  template<typename Scalar = double>
 // Options controlling the Wolfe line search used by BFGSMinimizer.
 // NOTE(review): the constructor's declaration line is missing from this
 // Doxygen dump (orig. line 58); the assignments below are its body.
 56  class LSOptions {
 57  public:
 59  c1 = 1e-4;
 60  c2 = 0.9;
 61  minAlpha = 1e-12;
 62  alpha0 = 1e-3;
 63  }
 64  Scalar c1;
 // ^ sufficient-decrease (Armijo) constant passed to the line search
 65  Scalar c2;
 // ^ curvature-condition constant passed to the line search
 66  Scalar alpha0;
 // ^ default initial step size, used on the first step / after a reset
 67  Scalar minAlpha;
 // ^ smallest step size the line search is allowed to try
 68  };
69 
70 
 71  template<typename FunctorType, typename QNUpdateType,
 72  typename Scalar = double, int DimAtCompile = Eigen::Dynamic>
 // Quasi-Newton (BFGS-family) minimizer.  FunctorType evaluates the
 // objective and gradient via operator()(x, f, g) returning a nonzero
 // error code on failure; QNUpdateType maintains the quasi-Newton
 // approximation via update(yk, sk[, reset]) and
 // search_direction(pk, gk).
 // NOTE(review): this Doxygen dump is missing several original lines in
 // this class (notably orig. 80: the VectorT members _xk/_gk/_pk and
 // their _1 counterparts; orig. 88-89: the _ls_opts / _conv_opts
 // members; orig. 110: part of rel_obj_decrease(); orig. 198: the
 // WolfeLineSearch call in step()).  Code below is reproduced verbatim.
 73  class BFGSMinimizer {
 74  public:
 75  typedef Eigen::Matrix<Scalar,DimAtCompile,1> VectorT;
 76  typedef Eigen::Matrix<Scalar,DimAtCompile,DimAtCompile> HessianT;
 77 
 78  protected:
 79  FunctorType &_func;
 // objective/gradient evaluator (reference; caller keeps ownership)
 81  Scalar _fk, _fk_1, _alphak_1;
 // current / previous objective values and previous accepted step size
 82  Scalar _alpha, _alpha0;
 // step size accepted by the last line search, and its initial guess
 83  size_t _itNum;
 // 1-based iteration counter, incremented at the top of step()
 84  std::string _note;
 // human-readable annotation for the last step (e.g. Hessian reset)
 85  QNUpdateType _qn;
 // quasi-Newton (inverse-)Hessian approximation updater
 86 
 87  public:
 90 
 91  QNUpdateType &get_qnupdate() { return _qn; }
 92  const QNUpdateType &get_qnupdate() const { return _qn; }
 93 
 // Accessors for the current iterate (suffix k) ...
 94  const Scalar &curr_f() const { return _fk; }
 95  const VectorT &curr_x() const { return _xk; }
 96  const VectorT &curr_g() const { return _gk; }
 97  const VectorT &curr_p() const { return _pk; }
 98 
 // ... and the previous iterate (suffix k-1).
 99  const Scalar &prev_f() const { return _fk_1; }
 100  const VectorT &prev_x() const { return _xk_1; }
 101  const VectorT &prev_g() const { return _gk_1; }
 102  const VectorT &prev_p() const { return _pk_1; }
 103  Scalar prev_step_size() const { return _pk_1.norm()*_alphak_1; }
 104 
 // Directional-derivative magnitude -p.g normalized by
 // max(|f|, fScale); compared against tolRelGrad * epsilon in step().
 105  inline Scalar rel_grad_norm() const {
 106  return -_pk.dot(_gk) / std::max(std::fabs(_fk),_conv_opts.fScale);
 107  }
 // Relative decrease |f_{k-1} - f_k| / max(|f_{k-1}|, ...).
 // NOTE(review): orig. line 110 is missing from this dump, which is why
 // the expression below appears to have an unmatched parenthesis.
 108  inline Scalar rel_obj_decrease() const {
 109  return std::fabs(_fk_1 - _fk) / std::max(std::fabs(_fk_1),
 111  _conv_opts.fScale));
 112  }
 113 
 114  const Scalar &alpha0() const { return _alpha0; }
 115  const Scalar &alpha() const { return _alpha; }
 116  const size_t iter_num() const { return _itNum; }
 117 
 118  const std::string &note() const { return _note; }
 119 
 // Map a termination code from step()/minimize() to a message.
 120  std::string get_code_string(int retCode) {
 121  switch(retCode) {
 122  case TERM_SUCCESS:
 123  return std::string("Successful step completed");
 124  case TERM_ABSF:
 125  return std::string("Convergence detected: absolute change in objective function was below tolerance");
 126  case TERM_RELF:
 127  return std::string("Convergence detected: relative change in objective function was below tolerance");
 128  case TERM_ABSGRAD:
 129  return std::string("Convergence detected: gradient norm is below tolerance");
 130  case TERM_RELGRAD:
 131  return std::string("Convergence detected: relative gradient magnitude is below tolerance");
 132  case TERM_ABSX:
 133  return std::string("Convergence detected: absolute parameter change was below tolerance");
 134  case TERM_MAXIT:
 135  return std::string("Maximum number of iterations hit, may not be at an optima");
 136  case TERM_LSFAIL:
 137  return std::string("Line search failed to achieve a sufficient decrease, no more progress can be made");
 138  default:
 139  return std::string("Unknown termination code");
 140  }
 141  }
 142 
 143  BFGSMinimizer(FunctorType &f) : _func(f) { }
 144 
 // Evaluate f and g at x0 and seed the first search direction with the
 // steepest-descent direction -g.  Throws if the evaluation fails.
 145  void initialize(const VectorT &x0) {
 146  int ret;
 147  _xk = x0;
 148  ret = _func(_xk,_fk,_gk);
 149  if (ret) {
 150  throw std::runtime_error("Error evaluating initial BFGS point.");
 151  }
 152  _pk = -_gk;
 153 
 154  _itNum = 0;
 155  _note = "";
 156  }
 157 
 // Take one quasi-Newton step: choose an initial step size, run the
 // Wolfe line search (retrying once with a Hessian reset on failure),
 // apply the QN update, and evaluate the convergence tests.
 // Returns 0 (TERM_SUCCESS) while progress continues, otherwise a
 // TERM_* code describing why iteration should stop.
 158  int step() {
 159  Scalar gradNorm, stepNorm;
 160  VectorT sk, yk;
 161  int retCode(0);
 162  int resetB(0);
 163 
 164  _itNum++;
 165 
 166  if (_itNum == 1) {
 167  resetB = 1;
 168  _note = "";
 169  }
 170  else {
 171  resetB = 0;
 172  _note = "";
 173  }
 174 
 175  while (true) {
 176  if (resetB) {
 177  // Reset the Hessian approximation
 178  _pk.noalias() = -_gk;
 179  }
 180 
 181  // Get an initial guess for the step size (alpha)
 182  if (_itNum > 1 && resetB != 2) {
 183  // use cubic interpolation based on the previous step
 184  _alpha0 = _alpha = std::min(1.0,
 185  1.01*CubicInterp(_gk_1.dot(_pk_1),
 186  _alphak_1,
 187  _fk - _fk_1,
 188  _gk.dot(_pk_1),
 189  _ls_opts.minAlpha, 1.0));
 190  }
 191  else {
 192  // On the first step (or, after a reset) use the default step size
 193  _alpha0 = _alpha = _ls_opts.alpha0;
 194  }
 195 
 196  // Perform the line search. If successful, the results are in the
 197  // variables: _xk_1, _fk_1 and _gk_1.
 // NOTE(review): orig. line 198 (the WolfeLineSearch(...) call whose
 // return value is assigned to retCode) is missing from this dump;
 // the lines below are the tail of that call's argument list.
 199  _pk, _xk, _fk, _gk,
 200  _ls_opts.c1, _ls_opts.c2,
 201  _ls_opts.minAlpha);
 202  if (retCode) {
 203  // Line search failed...
 204  if (resetB) {
 205  // did a Hessian reset and it still failed, and nothing left to try
 206  retCode = TERM_LSFAIL;
 207  return retCode;
 208  }
 209  else {
 210  // try resetting the Hessian approximation
 211  resetB = 2;
 212  _note += "LS failed, Hessian reset";
 213  continue;
 214  }
 215  }
 216  else {
 217  break;
 218  }
 219  }
 220 
 221  // Swap things so that k is the most recent iterate
 222  std::swap(_fk,_fk_1);
 223  _xk.swap(_xk_1);
 224  _gk.swap(_gk_1);
 225  _pk.swap(_pk_1);
 226 
 // Step and gradient-difference vectors for the quasi-Newton update.
 227  sk.noalias() = _xk - _xk_1;
 228  yk.noalias() = _gk - _gk_1;
 229 
 230  gradNorm = _gk.norm();
 231  stepNorm = sk.norm();
 232 
 233  // Update QN approximation
 234  if (resetB) {
 235  // If the QN approximation was reset, automatically scale it
 236  // and update the step-size accordingly
 237  Scalar B0fact = _qn.update(yk,sk,true);
 238  _pk_1 /= B0fact;
 239  _alphak_1 = _alpha*B0fact;
 240  }
 241  else {
 242  _qn.update(yk,sk);
 243  _alphak_1 = _alpha;
 244  }
 245 
 246 
 247  // Compute search direction for next step
 248  _qn.search_direction(_pk,_gk);
 249 
 250  // Check for convergence
 251  if (std::fabs(_fk_1 - _fk) < _conv_opts.tolAbsF) {
 252  retCode = TERM_ABSF; // Objective function improvement wasn't sufficient
 253  }
 254  else if (gradNorm < _conv_opts.tolAbsGrad) {
 255  retCode = TERM_ABSGRAD; // Gradient norm was below threshold
 256  }
 257  else if (stepNorm < _conv_opts.tolAbsX) {
 258  retCode = TERM_ABSX; // Change in x was too small
 259  }
 260  else if (_itNum >= _conv_opts.maxIts) {
 261  retCode = TERM_MAXIT; // Max number of iterations hit
 262  }
 263  else if (rel_obj_decrease() < _conv_opts.tolRelF*std::numeric_limits<Scalar>::epsilon()) {
 264  retCode = TERM_RELF; // Relative improvement in objective function wasn't sufficient
 265  }
 266  else if (rel_grad_norm() < _conv_opts.tolRelGrad*std::numeric_limits<Scalar>::epsilon()) {
 267  retCode = TERM_RELGRAD; // Relative gradient norm was below threshold
 268  }
 269  else {
 270  retCode = TERM_SUCCESS; // Step was successful more progress to be made
 271  }
 272 
 273  return retCode;
 274  }
 275 
 // Run step() from x0 until a nonzero termination code is produced,
 // write the final iterate back into x0, and return that code.
 276  int minimize(VectorT &x0) {
 277  int retcode;
 278  initialize(x0);
 279  while (!(retcode = step()));
 280  x0 = _xk;
 281  return retcode;
 282  }
 283  };
284 
285 
286 
287  template <typename M>
288  double lp_no_jacobian(const M& model,
289  std::vector<double>& params_r,
290  std::vector<int>& params_i,
291  std::ostream* o = 0) {
292  // FIXME: is this supposed to return the log probability from the model?
293  return 0;
294  }
295 
 296  template <class M>
 // Adapts a Stan model to the FunctorType interface expected by
 // BFGSMinimizer: operator()(x, f[, g]) computes f = -log_prob (and its
 // gradient), returning 0 on success or a nonzero error code.
 // NOTE(review): the `using` declarations at orig. lines 316-317 and
 // 346-347 (presumably bringing index_type into scope) are missing from
 // this Doxygen dump.
 297  class ModelAdaptor {
 298  private:
 299  M& _model;
 // model being optimized (reference; caller keeps ownership)
 300  std::vector<int> _params_i;
 // fixed integer parameters passed through to log_prob
 301  std::ostream* _msgs;
 // optional stream for error messages; may be null
 302  std::vector<double> _x, _g;
 // scratch buffers for converting Eigen vectors to std::vector
 303  size_t _fevals;
 // count of gradient evaluations (incremented in the 3-arg operator())
 304 
 305  public:
 306  ModelAdaptor(M& model,
 307  const std::vector<int>& params_i,
 308  std::ostream* msgs)
 309  : _model(model), _params_i(params_i), _msgs(msgs), _fevals(0) {}
 310 
 311  size_t fevals() const { return _fevals; }
 // Evaluate f = -log_prob(x) only (no gradient).
 // Returns 0 on success, 1 if log_prob threw, 2 if f is non-finite.
 312  int operator()(const Eigen::Matrix<double,Eigen::Dynamic,1> &x,
 313  double &f) {
 314  using Eigen::Matrix;
 315  using Eigen::Dynamic;
 318  typedef typename index_type<Matrix<double,Dynamic,1> >::type idx_t;
 319 
 // Copy the Eigen vector into the std::vector the model API expects.
 320  _x.resize(x.size());
 321  for (idx_t i = 0; i < x.size(); i++)
 322  _x[i] = x[i];
 323 
 324  try {
 // Negated because BFGS minimizes while Stan maximizes log prob.
 325  f = - log_prob_propto<false>(_model, _x, _params_i, _msgs);
 326  } catch (const std::exception& e) {
 327  if (_msgs)
 328  (*_msgs) << e.what() << std::endl;
 329  return 1;
 330  }
 331 
 332  if (boost::math::isfinite(f))
 333  return 0;
 334  else {
 335  if (_msgs)
 336  *_msgs << "Error evaluating model log probability: "
 337  "Non-finite function evaluation." << std::endl;
 338  return 2;
 339  }
 340  }
 // Evaluate f = -log_prob(x) and its gradient g (also negated).
 // Returns 0 on success, 1 if log_prob_grad threw, 2 if f is
 // non-finite, 3 if any gradient component is non-finite.
 341  int operator()(const Eigen::Matrix<double,Eigen::Dynamic,1> &x,
 342  double &f,
 343  Eigen::Matrix<double,Eigen::Dynamic,1> &g) {
 344  using Eigen::Matrix;
 345  using Eigen::Dynamic;
 348  typedef typename index_type<Matrix<double,Dynamic,1> >::type idx_t;
 349 
 350  _x.resize(x.size());
 351  for (idx_t i = 0; i < x.size(); i++)
 352  _x[i] = x[i];
 353 
 354  _fevals++;
 355 
 356  try {
 357  f = - log_prob_grad<true,false>(_model, _x, _params_i, _g, _msgs);
 358  } catch (const std::exception& e) {
 359  if (_msgs)
 360  (*_msgs) << e.what() << std::endl;
 361  return 1;
 362  }
 363 
 // Negate and validate the gradient component-wise.
 364  g.resize(_g.size());
 365  for (size_t i = 0; i < _g.size(); i++) {
 366  if (!boost::math::isfinite(_g[i])) {
 367  if (_msgs)
 368  *_msgs << "Error evaluating model log probability: "
 369  "Non-finite gradient." << std::endl;
 370  return 3;
 371  }
 372  g[i] = -_g[i];
 373  }
 374 
 375  if (boost::math::isfinite(f))
 376  return 0;
 377  else {
 378  if (_msgs)
 379  *_msgs << "Error evaluating model log probability: "
 380  "Non-finite function evaluation."
 381  << std::endl;
 382  return 2;
 383  }
 384  }
 // Gradient-only convenience wrapper; f is computed and discarded.
 385  int df(const Eigen::Matrix<double,Eigen::Dynamic,1> &x,
 386  Eigen::Matrix<double,Eigen::Dynamic,1> &g) {
 387  double f;
 388  return (*this)(x,f,g);
 389  }
 390  };
391 
 392  template<typename M, typename QNUpdateType, typename Scalar = double,
 393  int DimAtCompile = Eigen::Dynamic>
 // Convenience front-end: wraps a Stan model in a ModelAdaptor and runs
 // BFGSMinimizer on it, exposing std::vector-based accessors.
 // NOTE(review): orig. lines 400 (an idx_t typedef used below) and 417
 // (presumably the call forwarding x to the base-class initialize) are
 // missing from this Doxygen dump.
 394  class BFGSLineSearch : public BFGSMinimizer<ModelAdaptor<M>,QNUpdateType,Scalar,DimAtCompile> {
 395  private:
 396  ModelAdaptor<M> _adaptor;
 397  public:
 398  typedef BFGSMinimizer<ModelAdaptor<M>,QNUpdateType,Scalar,DimAtCompile> BFGSBase;
 399  typedef typename BFGSBase::VectorT vector_t;
 401 
 // Note: base is constructed with _adaptor before _adaptor itself is
 // initialized; the base stores only a reference, so this is safe here.
 402  BFGSLineSearch(M& model,
 403  const std::vector<double>& params_r,
 404  const std::vector<int>& params_i,
 405  std::ostream* msgs = 0)
 406  : BFGSBase(_adaptor),
 407  _adaptor(model,params_i,msgs)
 408  {
 409  initialize(params_r);
 410  }
 411 
 // Copy params_r into an Eigen vector for the base-class initializer.
 412  void initialize(const std::vector<double>& params_r) {
 413  Eigen::Matrix<double,Eigen::Dynamic,1> x;
 414  x.resize(params_r.size());
 415  for (size_t i = 0; i < params_r.size(); i++)
 416  x[i] = params_r[i];
 418  }
 419 
 // Number of gradient evaluations performed so far by the adaptor.
 420  size_t grad_evals() { return _adaptor.fevals(); }
 // Log probability at the current iterate (un-negates curr_f()).
 421  double logp() { return -(this->curr_f()); }
 422  double grad_norm() { return this->curr_g().norm(); }
 // Copy the (un-negated) gradient of log prob into g.
 423  void grad(std::vector<double>& g) {
 424  const vector_t &cg(this->curr_g());
 425  g.resize(cg.size());
 426  for (idx_t i = 0; i < cg.size(); i++)
 427  g[i] = -cg[i];
 428  }
 // Copy the current parameter vector into x.
 429  void params_r(std::vector<double>& x) {
 430  const vector_t &cx(this->curr_x());
 431  x.resize(cx.size());
 432  for (idx_t i = 0; i < cx.size(); i++)
 433  x[i] = cx[i];
 434  }
 435  };
436 
437  }
438 
439 }
440 
441 #endif
const size_t iter_num() const
Definition: bfgs.hpp:116
fvar< T > fabs(const fvar< T > &x)
Definition: fabs.hpp:18
void params_r(std::vector< double > &x)
Definition: bfgs.hpp:429
QNUpdateType & get_qnupdate()
Definition: bfgs.hpp:91
const Scalar & alpha0() const
Definition: bfgs.hpp:114
ModelAdaptor(M &model, const std::vector< int > &params_i, std::ostream *msgs)
Definition: bfgs.hpp:306
double log_prob_grad(const M &model, std::vector< double > &params_r, std::vector< int > &params_i, std::vector< double > &gradient, std::ostream *msgs=0)
Compute the gradient using reverse-mode automatic differentiation, writing the result into the specified gradient vector.
Definition: util.hpp:83
const VectorT & curr_x() const
Definition: bfgs.hpp:95
ConvergenceOptions< Scalar > _conv_opts
Definition: bfgs.hpp:89
const VectorT & prev_g() const
Definition: bfgs.hpp:101
BFGSMinimizer< ModelAdaptor< M >, QNUpdateType, Scalar, DimAtCompile > BFGSBase
Definition: bfgs.hpp:398
BFGSMinimizer(FunctorType &f)
Definition: bfgs.hpp:143
bool isfinite(const stan::agrad::var &v)
Checks if the given number has a finite value.
int WolfeLineSearch(FunctorType &func, Scalar &alpha, XType &x1, Scalar &f1, XType &gradx1, const XType &p, const XType &x0, const Scalar &f0, const XType &gradx0, const Scalar &c1, const Scalar &c2, const Scalar &minAlpha)
Perform a line search which finds an approximate solution to: satisfying the strong Wolfe conditions...
stan::math::index_type< vector_t >::type idx_t
Definition: bfgs.hpp:400
double max(const double a, const double b)
Definition: max.hpp:7
const Scalar & prev_f() const
Definition: bfgs.hpp:99
Eigen::Matrix< Scalar, DimAtCompile, 1 > VectorT
Definition: bfgs.hpp:75
int minimize(VectorT &x0)
Definition: bfgs.hpp:276
const std::string & note() const
Definition: bfgs.hpp:118
std::string get_code_string(int retCode)
Definition: bfgs.hpp:120
void grad(std::vector< double > &g)
Definition: bfgs.hpp:423
const VectorT & prev_p() const
Definition: bfgs.hpp:102
void initialize(const VectorT &x0)
Definition: bfgs.hpp:145
Primary template class for the metaprogram to compute the index type of a container.
Definition: index_type.hpp:21
int df(const Eigen::Matrix< double, Eigen::Dynamic, 1 > &x, Eigen::Matrix< double, Eigen::Dynamic, 1 > &g)
Definition: bfgs.hpp:385
Scalar rel_obj_decrease() const
Definition: bfgs.hpp:108
BFGSBase::VectorT vector_t
Definition: bfgs.hpp:399
const QNUpdateType & get_qnupdate() const
Definition: bfgs.hpp:92
Scalar prev_step_size() const
Definition: bfgs.hpp:103
int operator()(const Eigen::Matrix< double, Eigen::Dynamic, 1 > &x, double &f, Eigen::Matrix< double, Eigen::Dynamic, 1 > &g)
Definition: bfgs.hpp:341
const Scalar & alpha() const
Definition: bfgs.hpp:115
Eigen::Matrix< Scalar, DimAtCompile, DimAtCompile > HessianT
Definition: bfgs.hpp:76
Scalar CubicInterp(const Scalar &df0, const Scalar &x1, const Scalar &f1, const Scalar &df1, const Scalar &loX, const Scalar &hiX)
Find the minima in an interval [loX, hiX] of a cubic function which interpolates the points...
LSOptions< Scalar > _ls_opts
Definition: bfgs.hpp:88
double e()
Return the base of the natural logarithm.
Definition: constants.hpp:86
const VectorT & prev_x() const
Definition: bfgs.hpp:100
BFGSLineSearch(M &model, const std::vector< double > &params_r, const std::vector< int > &params_i, std::ostream *msgs=0)
Definition: bfgs.hpp:402
const VectorT & curr_p() const
Definition: bfgs.hpp:97
const Scalar & curr_f() const
Definition: bfgs.hpp:94
double min(const double a, const double b)
Definition: min.hpp:7
void initialize(const std::vector< double > &params_r)
Definition: bfgs.hpp:412
int operator()(const Eigen::Matrix< double, Eigen::Dynamic, 1 > &x, double &f)
Definition: bfgs.hpp:312
Scalar rel_grad_norm() const
Definition: bfgs.hpp:105
const VectorT & curr_g() const
Definition: bfgs.hpp:96
double lp_no_jacobian(const M &model, std::vector< double > &params_r, std::vector< int > &params_i, std::ostream *o=0)
Definition: bfgs.hpp:288
double log_prob_propto(const M &model, std::vector< double > &params_r, std::vector< int > &params_i, std::ostream *msgs=0)
Helper function to calculate log probability for double scalars up to a proportion.
Definition: util.hpp:41

     [ Stan Home Page ] © 2011–2014, Stan Development Team.