diff --git a/doc/html/indexes/s01.html b/doc/html/indexes/s01.html index dbd2752c3..493ce79f1 100644 --- a/doc/html/indexes/s01.html +++ b/doc/html/indexes/s01.html @@ -1022,9 +1022,9 @@
  • Complements are supported too - and when to use them

  • Confidence Intervals on the Standard Deviation

  • Error Handling Example

  • -
  • Gradient Desccent

  • +
  • Gradient Descent

  • L-BFGS

  • -
  • Nesterov Accelerated Gradient Desccent

  • +
  • Nesterov Accelerated Gradient Descent

  • Reverse Mode Automatic Differentiation

  • Testing a sample mean for difference from a "true" mean

  • @@ -1616,7 +1616,7 @@
  • Error Function Inverses

  • Finding Zeros of Airy Functions

  • Generic operations common to all distributions are non-member functions

  • -
  • Gradient Desccent

  • +
  • Gradient Descent

  • Heuman Lambda Function

  • Implementation

  • Inverse Chi Squared Distribution

  • @@ -2759,7 +2759,7 @@
  • make_gradient_descent

    -
    +
  • make_lbfgs

    @@ -2767,7 +2767,7 @@
  • make_nag

    -
    +
  • mapairy_distribution

    @@ -2987,7 +2987,7 @@
  • norm

  • @@ -3800,11 +3800,11 @@
  • step

  • students_t_distribution

  • @@ -4042,9 +4042,9 @@
  • unif01

  • @@ -4135,7 +4135,7 @@
  • Empirical Cumulative Distribution Function

  • Lanczos Smoothing Derivatives

  • Modified Bessel Functions of the First and Second Kinds

  • -
  • Nesterov Accelerated Gradient Desccent

  • +
  • Nesterov Accelerated Gradient Descent

  • Noncentral Chi-Squared Distribution

  • Spherical Bessel Functions of the First and Second Kinds

  • t -tests

  • diff --git a/doc/html/indexes/s02.html b/doc/html/indexes/s02.html index 5bfa101d1..81da5381e 100644 --- a/doc/html/indexes/s02.html +++ b/doc/html/indexes/s02.html @@ -34,7 +34,7 @@
    @@ -4723,6 +4727,8 @@
    @@ -5410,23 +5416,21 @@
  • -

    Gradient Desccent

    +

    Gradient Descent

  • -
  • -

    gradient_descent

    - -
  • +
  • gradient_descent

  • gradient_norm_convergence_policy

    @@ -6158,6 +6162,7 @@
    Introduction
    -
    Gradient Desccent
    +
    Gradient Descent
    Nesterov Accelerated Gradient - Desccent
    + Descent
    L-BFGS
    minimize
    Reverse Mode autodiff policies
    diff --git a/doc/html/math_toolkit/gd_opt/gradient_descent.html b/doc/html/math_toolkit/gd_opt/gradient_descent.html index e7be9ea96..6df1e2b13 100644 --- a/doc/html/math_toolkit/gd_opt/gradient_descent.html +++ b/doc/html/math_toolkit/gd_opt/gradient_descent.html @@ -1,13 +1,13 @@ -Gradient Desccent +Gradient Descent - + @@ -25,7 +25,7 @@
    @@ -101,7 +101,7 @@

    where lr is a user defined - learning rate. For a more complete decription of the theoretical principle + learning rate. For a more complete description of the theoretical principle check the wikipedia page

    @@ -123,7 +123,7 @@
    • Objective&& - obj : objective funciton to + obj : objective function to minimize
    • @@ -134,15 +134,21 @@ RealType& lr : learning rate. A larger value takes larger steps during descent, leading to faster, but more - unstable convergence. Conversely, small vaues are more stable but take + unstable convergence. Conversely, small values are more stable but take longer to converge.
    • InitializationPolicy&& ip - : Initialization policy for ArgumentContainer, - or the initial guess. By default it is set to tape_initializer_rvar<RealType> which lets the user provide the "initial - guess" by setting the values of x - manually. For more info check the Policies section. + : Initialization policy for optimizer state and variables. Users may + supply a custom initialization policy to control how the argument container + and any AD specific runtime state : i.e. reverse-mode tape attachment/reset + are initialized. By default, the optimizer uses the user-provided initial + values in x and performs the standard reverse mode AD initialization + required for gradient evaluation. Custom initialization policies are + useful for randomized starts, non rvar AD types, or when gradients are + supplied externally. See the reverse-mode autodiff policy documentation + for the required initialization policy interface when writing custom + policies.
    • ObjectiveEvalPolicy&& @@ -151,7 +157,7 @@
    • GradEvalPolicy&& - gep : tells the optimzier how + gep : tells the optimizer how to evaluate the gradient of the objective function. By default reverse_mode_gradient_evaluation_policy<RealType>
    @@ -173,8 +179,8 @@

    - The code below manually minimizes the abover potential energy function for - N particles over their two angular pozitions. + The code below manually minimizes the above potential energy function for + N particles over their two angular positions.

    #include <boost/math/differentiation/autodiff_reverse.hpp>
     #include <boost/math/optimization/gradient_descent.hpp>
    @@ -319,7 +325,7 @@
     
    const double lr     = 1e-3;

    is the optimizer learning rate. Using the code the way its written, the optimizer - runs for 100000 steps. Running tthe program with + runs for 100000 steps. Running the program with

    ./thomson_sphere N
     
    @@ -332,7 +338,7 @@

    Below is a plot of the final energy of the system, and its deviation from - the theoretically predicted values. The table of theorical energy values + the theoretically predicted values. The table of theoretical energy values for the problem is from wikipedia.

    @@ -346,7 +352,7 @@

    Often, we don't want to actually implement our own stepping function, i.e. we care about certain convergence criteria. In the above example, we need - to include the minimier.hpp header: + to include the minimizer.hpp header:

    #include <boost/math/optimization/minimizer.hpp>
     
    diff --git a/doc/html/math_toolkit/gd_opt/introduction.html b/doc/html/math_toolkit/gd_opt/introduction.html index a3a301e17..dd3794f43 100644 --- a/doc/html/math_toolkit/gd_opt/introduction.html +++ b/doc/html/math_toolkit/gd_opt/introduction.html @@ -7,7 +7,7 @@ - + @@ -28,13 +28,713 @@ Introduction

    - Gradient based optimizers are algorithms that use the gradient of a funciton + Gradient based optimizers are algorithms that use the gradient of a function to iteratively find locally extreme points of functions over a set of parameters. This sections provides a description of a set of gradient optimizers. The optimizers are written with boost::math::differentiation::reverse_mode::rvar in mind, however if a way to evaluate the funciton and its gradient is provided, the optimizers should work in exactly the same way.

    +

    + Below is a table that summarizes the intended usage patterns of the provided + optimizers and policies, and is meant as a practical guide rather than a + strict prescription: +

    +

    + + List + of Optimizers +

    +
    ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    + Optimizer +

    +
    +

    + Order +

    +
    +

    + Uses Curvature +

    +
    +

    + Memory Cost +

    +
    +

    + Intended Problem Class +

    +
    +

    + When to Use +

    +
    +

    + gradient descent +

    +
    +

    + first +

    +
    +

    + no +

    +
    +

    + low +

    +
    +

    + Smooth, well-scaled objectives +

    +
    +

    + Baseline method; debugging; when behavior transparency matters +

    +
    +

    + nesterov accelerated gradient +

    +
    +

    + first +

    +
    +

    + no +

    +
    +

    + low +

    +
    +

    + Ill-conditioned or narrow-valley problems +

    +
    +

    + When plain gradient descent converges slowly or oscillates +

    +
    +

    + L-BFGS +

    +
    +

    + quasi second order +

    +
    +

    + approximate +

    +
    +

    + medium +

    +
    +

    + Smooth, deterministic objectives +

    +
    +

    + When gradients are reliable and faster convergence is needed +

    +
    +

    + + Optimizer + Policies +

    +
    + + Initialization + Policies +
    +
    +++++ + + + + + + + + + + + + + + + + + + + + + + +
    +

    + Policy +

    +
    +

    + Use case +

    +
    +

    + Responsibilities +

    +
    +

    + tape_initializer_rvar +

    +
    +

     + User initializes all variables manually

    +
    +

    + initializes tape +

    +
    +

    + random_uniform_initializer_rvar +

    +
    +

    + Initializes all variables with a random number between a min and + max value +

    +
    +

    + Initializes variables. Initializes tape. +

    +
    +

    + costant_initializer_rvar +

    +
    +

    + Initializes all variables with a constant +

    +
    +

    + Initializes variables. Initializes tape. +

    +
    +
    + + Evaluation + Policies +
    +
    +++++ + + + + + + + + + + + + + + + + + +
    +

    + Policy +

    +
    +

    + Use case +

    +
    +

    + Responsibilities +

    +
    +

    + reverse_mode_function_eval_policy +

    +
    +

     + Default. Use with boost reverse mode autodiff

    +
    +

    + tells the optimizer how to evaluate the objective +

    +
    +

    + reverse_mode_gradient_evaluation_policy +

    +
    +

     + Default. Use with boost reverse mode autodiff

    +
    +

    + tells the optimizer how to evaluate the gradients of an objective +

    +
    +

    + These policies are intended to use with boost reverse mode autodiff. If you + need to use the optimizers with a custom AD variable, or by providing the + gradient of an objective manually, check the docs for policies to see how + the policies are implemented. +

    +

    + + LBFGS line + search policies +

    +

     + The table below summarizes the two line search policies provided for use + with LBFGS.

    +
    +++++++ + + + + + + + + + + + + + + + + + + + + + + + +
    +

    + Policy +

    +
    +

    + Enforced Conditions +

    +
    +

    + Per iteration cost +

    +
    +

    + Convergence +

    +
    +

    + Use case +

    +
    +

    + Strong Wolfe +

    +
    +

     + function decrease, curvature condition

    +
    +

    + higher +

    +
    +

    + faster +

    +
    +

    + most of the time +

    +
    +

    + Armijo +

    +
    +

    + function decrease only +

    +
    +

    + lower +

    +
    +

    + slower +

    +
    +

    + you know what you're doing +

    +
    +

    + + Minimizer + Policies +

    +
    + + Convergence + Policies +
    +
    +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    + Policy +

    +
    +

    + Criterion +

    +
    +

    + When to Use +

    +
    +

    + gradient_norm_convergence_policy +

    +
    +

    + gradient norm < tol +

    +
    +

    + Default. Stationarity based condition +

    +
    +

    + objective_tol_convergence_policy +

    +
    +

    + absolute difference between objective steps is small +

    +
    +

    + Well-scaled objectives +

    +
    +

    + relative_objective_tol_policy +

    +
    +

    + relative difference between objective steps is small +

    +
    +

    + Scale-invariant convergence +

    +
    +

    + combined_convergence_policy +

    +
    +

    + logical combination OR +

    +
    +

    + you need a combination of convergence conditions +

    +
    +
    + + Termination + Policies +
    +
    +++++ + + + + + + + + + + + + + + + + + +
    +

    + Policy +

    +
    +

    + Controls +

    +
    +

    + When to Use +

    +
    +

    + max_iter_termination_policy +

    +
    +

    + iteration count +

    +
    +

    + Hard safety bound (almost always recommended) +

    +
    +

    + wallclock_termination_policy +

    +
    +

    + wall clock time +

    +
    +

    + benchmarking, real-time constraints +

    +
    +
    + + Constraint + and Projection Policies +
    +
    ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    + Policy +

    +
    +

    + Constraint Type +

    +
    +

    + unconstrained_policy +

    +
    +

    + No constraint +

    +
    +

    + box_constraints +

    +
    +

    + upper/lower bound clip +

    +
    +

    + nonnegativity_constraint +

    +
    +

    + set everything below 0, to 0 +

    +
    +

    + l2_ball_constraint +

    +
    +

    + 2-norm(x) < r +

    +
    +

    + l1_ball_constraint +

    +
    +

    + 1-norm(x) < r +

    +
    +

    + simplex_constraint +

    +
    +

    + Probability simplex +

    +
    +

    + function_constraint +

    +
    +

    + custom user provided function wrapper +

    +
    +

    + unit_sphere_constraint +

    +
    +

    + 2-norm(x) = 1 +

    +
  • InitializationPolicy&& ip - : initialization policy for ArgumentContainer and optimizer state. For - reverse-mode AD, the default typically initializes/attaches the tape - and uses the user-provided initial values in x. + : Initialization policy for optimizer state and variables. Users may + supply a custom initialization policy to control how the argument container + and any AD-specific runtime state : i.e. reverse-mode tape attachment/reset + are initialized. By default, the optimizer uses the same initialization + as gradient descent, taking the user provided initial values in x and + initializing the internal momentum/velocity state to zero. Custom initialization + policies are useful for randomized starts, non rvar AD types, or when + gradients are supplied externally. Check the docs for Reverse Mode autodiff + policies for initialization policy structure to write custom policies.
  • ObjectiveEvalPolicy&& @@ -207,16 +213,21 @@ evaluation policy when using rvar.
  • +

    LineSearchPolicy&& lsp : policy for selecting - the step length alpha. Default is Strong Wolfe, but Armijo is an option. -

  • - + the step length alpha along a search direction. Determines the acceptance + criteria and how many function/gradient evaluations may be performed + during a step. Default is Strong Wolfe, but Armijo is an option. Strong + Wolfe uses both function and gradient information to ensure good curvature + conditions, while Armijo relies only on function decrease and is simpler + but less robust for quasi-Newton methods. -

    - Notes -
    -
      + Notes + +
    • LBFGS assumes the objective is sufficiently smooth for gradients to be informative. It is typically most effective on unconstrained smooth problems. diff --git a/doc/html/math_toolkit/gd_opt/nesterov.html b/doc/html/math_toolkit/gd_opt/nesterov.html index a77ca1a50..8152ed5d4 100644 --- a/doc/html/math_toolkit/gd_opt/nesterov.html +++ b/doc/html/math_toolkit/gd_opt/nesterov.html @@ -1,12 +1,12 @@ -Nesterov Accelerated Gradient Desccent +Nesterov Accelerated Gradient Descent - + @@ -25,8 +25,8 @@
    @@ -78,7 +78,7 @@ }; /* Convenience overloads */ -/* make nesterov acelerated gradient descent by providing +/* make nesterov accelerated gradient descent by providing ** objective function ** variables to optimize over ** Optionally @@ -106,7 +106,7 @@ InitializationPolicy&& ip); /* provide - * initilaization policy + * initialization policy * objective evaluation policy * gradient evaluation policy */ @@ -175,7 +175,7 @@
  • RealType lr : learning rate. Larger values take larger steps (faster but potentially - unsable). Smaller values are more stable but converge more slowly. + unstable). Smaller values are more stable but converge more slowly.
  • RealType mu @@ -184,10 +184,15 @@
  • InitializationPolicy&& ip - : initialization policy for the optimizer state and variables. For NAG, - this also initializes the internal momentum/velocity state. By default - the optimizer uses the same initializer as gradient descent and initializes - velocity to zero. + : Initialization policy for optimizer state and variables. Users may + supply a custom initialization policy to control how the argument container + and any AD-specific runtime state : i.e. reverse-mode tape attachment/reset + are initialized. By default, the optimizer uses the same initialization + as gradient descent, taking the user provided initial values in x and + initializing the internal momentum/velocity state to zero. Custom initialization + policies are useful for randomized starts, non rvar AD types, or when + gradients are supplied externally. Check the docs for Reverse Mode autodiff + policies for initialization policy structure to write custom policies.
  • ObjectiveEvalPolicy&& diff --git a/doc/html/optimization.html b/doc/html/optimization.html index b7f7ef96e..56db63645 100644 --- a/doc/html/optimization.html +++ b/doc/html/optimization.html @@ -37,9 +37,9 @@
    Gradient Based Optimizers
    Introduction
    -
    Gradient Desccent
    +
    Gradient Descent
    Nesterov Accelerated Gradient - Desccent
    + Descent
    L-BFGS
    minimize
    Reverse Mode autodiff policies
    diff --git a/doc/optimization/gradient_optimizers.qbk b/doc/optimization/gradient_optimizers.qbk index 9448ed751..f9c0001bd 100644 --- a/doc/optimization/gradient_optimizers.qbk +++ b/doc/optimization/gradient_optimizers.qbk @@ -13,6 +13,70 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) [section:introduction Introduction] Gradient based optimizers are algorithms that use the gradient of a function to iteratively find locally extreme points of functions over a set of parameters. This sections provides a description of a set of gradient optimizers. The optimizers are written with `boost::math::differentiation::reverse_mode::rvar` in mind, however if a way to evaluate the funciton and its gradient is provided, the optimizers should work in exactly the same way. +Below is a table that summarizes the intended usage patterns of the provided optimizers and policies, and is meant as a practical guide rather than a strict prescription: + +[h1:table-optimizers List of Optimizers] +[table + [[Optimizer] [Order] [Uses Curvature] [Memory Cost] [Intended Problem Class] [When to Use]] + [[gradient descent] [first] [no] [low] [Smooth, well-scaled objectives] [Baseline method; debugging; when behavior transparency matters]] + [[nesterov accelerated gradient] [first] [no] [low] [Ill-conditioned or narrow-valley problems] [When plain gradient descent converges slowly or oscillates]] + [[L-BFGS] [quasi second order] [approximate] [medium] [Smooth, deterministic objectives] [When gradients are reliable and faster convergence is needed]] +] + +[h1:table-optimizer-policies Optimizer Policies] +[heading Initialization Policies] +[table + [[Policy] [Use case] [Responsibilities]] + [[tape_initializer_rvar] [User initialzes all varibles manually] [initializes tape]] + [[random_uniform_initializer_rvar] [Initializes all variables with a random number between a min and max value] [Initializes variables. 
Initializes tape.]] + [[costant_initializer_rvar] [Initializes all variables with a constant] [Initializes variables. Initializes tape.]] +] + +[heading Evaluation Policies] +[table + [[Policy] [Use case] [Responsibilities]] + [[reverse_mode_function_eval_policy] [Default. User with boost reverse mode autodiff] [tells the optimizer how to evaluate the objective]] + [[reverse_mode_gradient_evaluation_policy] [Default. User with boost reverse mode autodiff] [tells the optimizer how to evaluate the gradients of an objective]] +] + +These policies are intended to use with boost reverse mode autodiff. If you need to use the optimizers with a custom AD variable, or by providing the gradient of an objective manually, check the docs for policies to see how the policies are implemented. + +[h1:line-search-policies LBFGS line search policies] +the table below summarizes the two line search policies provided for use with LBFGS. +[table + [[Policy] [Enforced Conditions] [Per iteration cost] [Convergence] [Use case]] + [[Strong Wolfe] [function decrease. curvature condition] [higher] [faster] [most of the time]] + [[Armijo] [function decrease only] [lower] [slower] [you know what you're doing]] +] +[h1:minimizer-policies Minimizer Policies] +[heading Convergence Policies] +[table + [[Policy] [Criterion] [When to Use]] + [[gradient_norm_convergence_policy] [gradient norm < tol] [Default. 
Stationarity based condition]] + [[objective_tol_convergence_policy] [absolute difference between objective steps is small] [Well-scaled objectives]] + [[relative_objective_tol_policy] [relative difference between objective steps is small] [Scale-invariant convergence]] + [[combined_convergence_policy] [logical combination OR] [you need a combination of convergence conditions]] +] + +[heading Termination Policies] +[table + [[Policy] [Controls] [When to Use]] + [[max_iter_termination_policy] [iteration count] [Hard safety bound (almost always recommended)]] + [[wallclock_termination_policy] [wall clock time] [benchmarking, real-time constraints]] +] + +[heading Constraint and Projection Policies] +[table + [[Policy] [Constraint Type]] + [[unconstrained_policy] [No constraint]] + [[box_constraints] [upper/lower bound clip]] + [[nonnegativity_constraint] [set everything below 0, to 0]] + [[l2_ball_constraint] [ 2-norm(x) < r]] + [[l1_ball_constraint] [ 1-norm(x) < r]] + [[simplex_constraint] [Probability simplex]] + [[function_constraint] [custom user provided function wrapper]] + [[unit_sphere_constraint] [2-norm(x) = 1]] +] [endsect] [/section:introduction] [section:gradient_descent Gradient Descent] diff --git a/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp b/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp index a00a4c1d6..8fd3c2a08 100644 --- a/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp +++ b/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp @@ -5,7 +5,6 @@ #ifndef BOOST_MATH_OPTIMIZATION_DETAIL_DIFFERENTIABLE_OPT_UTILITIES_HPP #define BOOST_MATH_OPTIMIZATION_DETAIL_DIFFERENTIABLE_OPT_UTILITIES_HPP #include -#include #include #include #include @@ -144,11 +143,11 @@ random_vector(size_t n) /** @brief> generates a random std::vector of size n * using mt19937 algorithm */ - static boost::random::mt19937 rng{ std::random_device{}() }; - static 
boost::random::uniform_real_distribution dist(0.0, 1.0); + static std::mt19937 rng{ std::random_device{}() }; + static std::uniform_real_distribution dist(0.0, 1.0); std::vector result(n); - std::generate(result.begin(), result.end(), [&] { return dist(rng); }); + std::generate(result.begin(), result.end(), [&] { return static_cast(dist(rng)); }); return result; } diff --git a/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp b/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp index dbb365b5e..b4cad4b00 100644 --- a/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp +++ b/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp @@ -8,7 +8,6 @@ #include #include #include -#include namespace boost { namespace math { @@ -102,10 +101,10 @@ struct random_uniform_initializer_rvar template void operator()(ArgumentContainer& x) const { - static boost::random::mt19937 gen{ std::random_device{}() }; - static boost::random::uniform_real_distribution dist(0.0, 1.0); + static std::mt19937 gen{ std::random_device{}() }; + static std::uniform_real_distribution dist(0.0, 1.0); for (auto& xi : x) { - xi = rdiff::rvar(dist(gen)); + xi = rdiff::rvar(static_cast(dist(gen))); } auto& tape = rdiff::get_active_tape(); tape.add_checkpoint();