diff --git a/doc/html/indexes/s01.html b/doc/html/indexes/s01.html index dbd2752c3..493ce79f1 100644 --- a/doc/html/indexes/s01.html +++ b/doc/html/indexes/s01.html @@ -1022,9 +1022,9 @@
Generic operations common to all distributions are non-member functions
make_gradient_descent
- +make_lbfgs
@@ -2767,7 +2767,7 @@make_nag
- +mapairy_distribution
@@ -2987,7 +2987,7 @@norm
Graphing, Profiling, and Generating Test Data for Special Functions
step
unif01
abstract_optimizer
- +gradient_descent
- -gradient_norm_convergence_policy
@@ -377,11 +374,11 @@nesterov_accelerated_gradient
- +nesterov_update_policy
- +nonfinite_num_get
@@ -513,9 +510,9 @@rvar
abstract_optimizer
- +accuracy
@@ -696,6 +696,10 @@ @@ -3317,13 +3321,13 @@Generic operations common to all distributions are non-member functions
Gradient Desccent
+Gradient Descent
gradient_descent
- -gradient_norm_convergence_policy
@@ -6158,6 +6162,7 @@L-BFGS
make_gradient_descent
- +make_lbfgs
@@ -7405,7 +7412,7 @@make_nag
- +makima
@@ -7801,29 +7808,31 @@Nesterov Accelerated Gradient Desccent
+Nesterov Accelerated Gradient Descent
nesterov_accelerated_gradient
- +nesterov_update_policy
- +newton_raphson_iterate
@@ -8009,7 +8018,7 @@norm
rvar
Graphing, Profiling, and Generating Test Data for Special Functions
step
unif01
where lr is a user defined
- learning rate. For a more complete decription of the theoretical principle
+ learning rate. For a more complete description of the theoretical principle
check the wikipedia
page
Objective&&
- obj : objective funciton to
+ obj : objective function to
minimize
RealType&
lr : learning rate. A larger
value takes larger steps during descent, leading to faster, but more
- unstable convergence. Conversely, small vaues are more stable but take
+ unstable convergence. Conversely, small values are more stable but take
longer to converge.
InitializationPolicy&& ip
- : Initialization policy for ArgumentContainer,
- or the initial guess. By default it is set to tape_initializer_rvar<RealType> which lets the user provide the "initial
- guess" by setting the values of x
- manually. For more info check the Policies section.
+ : Initialization policy for optimizer state and variables. Users may
+ supply a custom initialization policy to control how the argument container
+ and any AD specific runtime state : i.e. reverse-mode tape attachment/reset
+ are initialized. By default, the optimizer uses the user-provided initial
+ values in x and performs the standard reverse mode AD initialization
+ required for gradient evaluation. Custom initialization policies are
+ useful for randomized starts, non rvar AD types, or when gradients are
+ supplied externally. See the reverse-mode autodiff policy documentation
+ for the required initialization policy interface when writing custom
+ policies.
ObjectiveEvalPolicy&&
@@ -151,7 +157,7 @@
GradEvalPolicy&&
- gep : tells the optimzier how
+ gep : tells the optimizer how
to evaluate the gradient of the objective function. By default reverse_mode_gradient_evaluation_policy<RealType>
- The code below manually minimizes the abover potential energy function for - N particles over their two angular pozitions. + The code below manually minimizes the above potential energy function for + N particles over their two angular positions.
#include <boost/math/differentiation/autodiff_reverse.hpp> #include <boost/math/optimization/gradient_descent.hpp> @@ -319,7 +325,7 @@const double lr = 1e-3;is the optimizer learning rate. Using the code the way its written, the optimizer - runs for 100000 steps. Running tthe program with + runs for 100000 steps. Running the program with
./thomson_sphere N@@ -332,7 +338,7 @@Below is a plot of the final energy of the system, and its deviation from - the theoretically predicted values. The table of theorical energy values + the theoretically predicted values. The table of theoretical energy values for the problem is from wikipedia.
@@ -346,7 +352,7 @@
Often, we don't want to actually implement our own stepping function, i.e. we care about certain convergence criteria. In the above example, we need - to include the minimier.hpp header: + to include the minimizer.hpp header:
#include <boost/math/optimization/minimizer.hpp>diff --git a/doc/html/math_toolkit/gd_opt/introduction.html b/doc/html/math_toolkit/gd_opt/introduction.html index a3a301e17..dd3794f43 100644 --- a/doc/html/math_toolkit/gd_opt/introduction.html +++ b/doc/html/math_toolkit/gd_opt/introduction.html @@ -7,7 +7,7 @@ - + @@ -28,13 +28,713 @@ Introduction- Gradient based optimizers are algorithms that use the gradient of a funciton + Gradient based optimizers are algorithms that use the gradient of a function to iteratively find locally extreme points of functions over a set of parameters. This sections provides a description of a set of gradient optimizers. The optimizers are written with
+boost::math::differentiation::reverse_mode::rvarin mind, however if a way to evaluate the funciton and its gradient is provided, the optimizers should work in exactly the same way.+ Below is a table that summarizes the intended usage patterns of the provided + optimizers and policies, and is meant as a practical guide rather than a + strict prescription: +
++ + List + of Optimizers +
+++
+ ++ + + + + + + + ++ ++ Optimizer +
++ ++ Order +
++ ++ Uses Curvature +
++ ++ Memory Cost +
++ ++ Intended Problem Class +
++ ++ When to Use +
++ ++ ++ gradient descent +
++ ++ first +
++ ++ no +
++ ++ low +
++ ++ Smooth, well-scaled objectives +
++ ++ Baseline method; debugging; when behavior transparency matters +
++ ++ ++ nesterov accelerated gradient +
++ ++ first +
++ ++ no +
++ ++ low +
++ ++ Ill-conditioned or narrow-valley problems +
++ ++ When plain gradient descent converges slowly or oscillates +
++ + ++ ++ L-BFGS +
++ ++ quasi second order +
++ ++ approximate +
++ ++ medium +
++ ++ Smooth, deterministic objectives +
++ ++ When gradients are reliable and faster convergence is needed +
++ + Optimizer + Policies +
++ + Initialization + Policies +
+++
+ ++ + + + + ++ ++ Policy +
++ ++ Use case +
++ ++ Responsibilities +
++ ++ ++ tape_initializer_rvar +
++ ++ User initializes all variables manually +
++ ++ initializes tape +
++ ++ ++ random_uniform_initializer_rvar +
++ ++ Initializes all variables with a random number between a min and + max value +
++ ++ Initializes variables. Initializes tape. +
++ + ++ ++ costant_initializer_rvar +
++ ++ Initializes all variables with a constant +
++ ++ Initializes variables. Initializes tape. +
++ + Evaluation + Policies +
+++
+ ++ + + + + ++ ++ Policy +
++ ++ Use case +
++ ++ Responsibilities +
++ ++ ++ reverse_mode_function_eval_policy +
++ ++ Default. Use with boost reverse mode autodiff +
++ ++ tells the optimizer how to evaluate the objective +
++ + ++ ++ reverse_mode_gradient_evaluation_policy +
++ ++ Default. Use with boost reverse mode autodiff +
++ ++ tells the optimizer how to evaluate the gradients of an objective +
++ These policies are intended for use with boost reverse mode autodiff. If you + need to use the optimizers with a custom AD variable, or by providing the + gradient of an objective manually, check the docs for policies to see how + the policies are implemented. +
++ + LBFGS line + search policies +
++ The table below summarizes the two line search policies provided for use + with LBFGS. +
+++
+ ++ + + + + + + ++ ++ Policy +
++ ++ Enforced Conditions +
++ ++ Per iteration cost +
++ ++ Convergence +
++ ++ Use case +
++ ++ ++ Strong Wolfe +
++ ++ function decrease, curvature condition +
++ ++ higher +
++ ++ faster +
++ ++ most of the time +
++ + ++ ++ Armijo +
++ ++ function decrease only +
++ ++ lower +
++ ++ slower +
++ ++ you know what you're doing +
++ + Minimizer + Policies +
++ + Convergence + Policies +
+++
+ ++ + + + + ++ ++ Policy +
++ ++ Criterion +
++ ++ When to Use +
++ ++ ++ gradient_norm_convergence_policy +
++ ++ gradient norm < tol +
++ ++ Default. Stationarity based condition +
++ ++ ++ objective_tol_convergence_policy +
++ ++ absolute difference between objective steps is small +
++ ++ Well-scaled objectives +
++ ++ ++ relative_objective_tol_policy +
++ ++ relative difference between objective steps is small +
++ ++ Scale-invariant convergence +
++ + ++ ++ combined_convergence_policy +
++ ++ logical combination OR +
++ ++ you need a combination of convergence conditions +
++ + Termination + Policies +
+++
+ ++ + + + + ++ ++ Policy +
++ ++ Controls +
++ ++ When to Use +
++ ++ ++ max_iter_termination_policy +
++ ++ iteration count +
++ ++ Hard safety bound (almost always recommended) +
++ + ++ ++ wallclock_termination_policy +
++ ++ wall clock time +
++ ++ benchmarking, real-time constraints +
++ + Constraint + and Projection Policies +
++ the step length alpha along a search direction. Determines the acceptance + criteria and how many function/gradient evaluations may be performed + during a step Default is Strong Wolfe, but Armijo is an option. Strong + Wolfe uses both function and gradient information to ensure good curvature + conditions, while Armijo relies only on function decrease and is simpler + but less robust for quasi-Newton methods. ++
+ ++ + + + ++ ++ Policy +
++ ++ Constraint Type +
++ ++ ++ unconstrained_policy +
++ ++ No constraint +
++ ++ ++ box_constraints +
++ ++ upper/lower bound clip +
++ ++ ++ nonnegativity_constraint +
++ ++ set everything below 0 to 0 +
++ ++ ++ l2_ball_constraint +
++ ++ 2-norm(x) < r +
++ ++ ++ l1_ball_constraint +
++ ++ 1-norm(x) < r +
++ ++ ++ simplex_constraint +
++ ++ Probability simplex +
++ ++ ++ function_constraint +
++ ++ custom user provided function wrapper +
++ + ++ ++ unit_sphere_constraint +
++ ++ 2-norm(x) = 1 +
+- Notes -
-+ Notes + +
- LBFGS assumes the objective is sufficiently smooth for gradients to be informative. It is typically most effective on unconstrained smooth problems. diff --git a/doc/html/math_toolkit/gd_opt/nesterov.html b/doc/html/math_toolkit/gd_opt/nesterov.html index a77ca1a50..8152ed5d4 100644 --- a/doc/html/math_toolkit/gd_opt/nesterov.html +++ b/doc/html/math_toolkit/gd_opt/nesterov.html @@ -1,12 +1,12 @@ -
Nesterov Accelerated Gradient Desccent +Nesterov Accelerated Gradient Descent - + @@ -25,8 +25,8 @@@@ -78,7 +78,7 @@ }; /* Convenience overloads */ -/* make nesterov acelerated gradient descent by providing +/* make nesterov accelerated gradient descent by providing ** objective function ** variables to optimize over ** Optionally @@ -106,7 +106,7 @@ InitializationPolicy&& ip); /* provide - * initilaization policy + * initialization policy * objective evaluation policy * gradient evaluation policy */ @@ -175,7 +175,7 @@
RealType lr: learning rate. Larger values take larger steps (faster but potentially - unsable). Smaller values are more stable but converge more slowly. + unstable). Smaller values are more stable but converge more slowly.RealType mu@@ -184,10 +184,15 @@InitializationPolicy&& ip- : initialization policy for the optimizer state and variables. For NAG, - this also initializes the internal momentum/velocity state. By default - the optimizer uses the same initializer as gradient descent and initializes - velocity to zero. + : Initialization policy for optimizer state and variables. Users may + supply a custom initialization policy to control how the argument container + and any AD-specific runtime state : i.e. reverse-mode tape attachment/reset + are initialized. By default, the optimizer uses the same initialization + as gradient descent, taking the user provided initial values in x and + initializing the internal momentum/velocity state to zero. Custom initialization + policies are useful for randomized starts, non rvar AD types, or when + gradients are supplied externally. Check the docs for Reverse Mode autodiff + policies for initialization policy structure to write custom policies.ObjectiveEvalPolicy&& diff --git a/doc/html/optimization.html b/doc/html/optimization.html index b7f7ef96e..56db63645 100644 --- a/doc/html/optimization.html +++ b/doc/html/optimization.html @@ -37,9 +37,9 @@Gradient Based Optimizers L-BFGS minimize Reverse Mode autodiff policies diff --git a/doc/optimization/gradient_optimizers.qbk b/doc/optimization/gradient_optimizers.qbk index 9448ed751..f9c0001bd 100644 --- a/doc/optimization/gradient_optimizers.qbk +++ b/doc/optimization/gradient_optimizers.qbk @@ -13,6 +13,70 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) [section:introduction Introduction] Gradient based optimizers are algorithms that use the gradient of a function to iteratively find locally extreme points of functions over a set of parameters. 
This sections provides a description of a set of gradient optimizers. The optimizers are written with `boost::math::differentiation::reverse_mode::rvar` in mind, however if a way to evaluate the funciton and its gradient is provided, the optimizers should work in exactly the same way. +Below is a table that summarizes the intended usage patterns of the provided optimizers and policies, and is meant as a practical guide rather than a strict prescription: + +[h1:table-optimizers List of Optimizers] +[table + [[Optimizer] [Order] [Uses Curvature] [Memory Cost] [Intended Problem Class] [When to Use]] + [[gradient descent] [first] [no] [low] [Smooth, well-scaled objectives] [Baseline method; debugging; when behavior transparency matters]] + [[nesterov accelerated gradient] [first] [no] [low] [Ill-conditioned or narrow-valley problems] [When plain gradient descent converges slowly or oscillates]] + [[L-BFGS] [quasi second order] [approximate] [medium] [Smooth, deterministic objectives] [When gradients are reliable and faster convergence is needed]] +] + +[h1:table-optimizer-policies Optimizer Policies] +[heading Initialization Policies] +[table + [[Policy] [Use case] [Responsibilities]] + [[tape_initializer_rvar] [User initialzes all varibles manually] [initializes tape]] + [[random_uniform_initializer_rvar] [Initializes all variables with a random number between a min and max value] [Initializes variables. Initializes tape.]] + [[costant_initializer_rvar] [Initializes all variables with a constant] [Initializes variables. Initializes tape.]] +] + +[heading Evaluation Policies] +[table + [[Policy] [Use case] [Responsibilities]] + [[reverse_mode_function_eval_policy] [Default. User with boost reverse mode autodiff] [tells the optimizer how to evaluate the objective]] + [[reverse_mode_gradient_evaluation_policy] [Default. 
User with boost reverse mode autodiff] [tells the optimizer how to evaluate the gradients of an objective]] +] + +These policies are intended to use with boost reverse mode autodiff. If you need to use the optimizers with a custom AD variable, or by providing the gradient of an objective manually, check the docs for policies to see how the policies are implemented. + +[h1:line-search-policies LBFGS line search policies] +the table below summarizes the two line search policies provided for use with LBFGS. +[table + [[Policy] [Enforced Conditions] [Per iteration cost] [Convergence] [Use case]] + [[Strong Wolfe] [function decrease. curvature condition] [higher] [faster] [most of the time]] + [[Armijo] [function decrease only] [lower] [slower] [you know what you're doing]] +] +[h1:minimizer-policies Minimizer Policies] +[heading Convergence Policies] +[table + [[Policy] [Criterion] [When to Use]] + [[gradient_norm_convergence_policy] [gradient norm < tol] [Default. Stationarity based condition]] + [[objective_tol_convergence_policy] [absolute difference between objective steps is small] [Well-scaled objectives]] + [[relative_objective_tol_policy] [relative difference between objective steps is small] [Scale-invariant convergence]] + [[combined_convergence_policy] [logical combination OR] [you need a combination of convergence conditions]] +] + +[heading Termination Policies] +[table + [[Policy] [Controls] [When to Use]] + [[max_iter_termination_policy] [iteration count] [Hard safety bound (almost always recommended)]] + [[wallclock_termination_policy] [wall clock time] [benchmarking, real-time constraints]] +] + +[heading Constraint and Projection Policies] +[table + [[Policy] [Constraint Type]] + [[unconstrained_policy] [No constraint]] + [[box_constraints] [upper/lower bound clip]] + [[nonnegativity_constraint] [set everything below 0, to 0]] + [[l2_ball_constraint] [ 2-norm(x) < r]] + [[l1_ball_constraint] [ 1-norm(x) < r]] + [[simplex_constraint] [Probability 
simplex]] + [[function_constraint] [custom user provided function wrapper]] + [[unit_sphere_constraint] [2-norm(x) = 1]] +] [endsect] [/section:introduction] [section:gradient_descent Gradient Descent] diff --git a/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp b/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp index a00a4c1d6..8fd3c2a08 100644 --- a/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp +++ b/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp @@ -5,7 +5,6 @@ #ifndef BOOST_MATH_OPTIMIZATION_DETAIL_DIFFERENTIABLE_OPT_UTILITIES_HPP #define BOOST_MATH_OPTIMIZATION_DETAIL_DIFFERENTIABLE_OPT_UTILITIES_HPP #include-#include #include #include #include @@ -144,11 +143,11 @@ random_vector(size_t n) /** @brief> generates a random std::vector of size n * using mt19937 algorithm */ - static boost::random::mt19937 rng{ std::random_device{}() }; - static boost::random::uniform_real_distribution dist(0.0, 1.0); + static std::mt19937 rng{ std::random_device{}() }; + static std::uniform_real_distribution dist(0.0, 1.0); std::vector result(n); - std::generate(result.begin(), result.end(), [&] { return dist(rng); }); + std::generate(result.begin(), result.end(), [&] { return static_cast (dist(rng)); }); return result; } diff --git a/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp b/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp index dbb365b5e..b4cad4b00 100644 --- a/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp +++ b/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp @@ -8,7 +8,6 @@ #include #include #include -#include namespace boost { namespace math { @@ -102,10 +101,10 @@ struct random_uniform_initializer_rvar template void operator()(ArgumentContainer& x) const { - static boost::random::mt19937 gen{ std::random_device{}() }; - static boost::random::uniform_real_distribution dist(0.0, 1.0); + 
static std::mt19937 gen{ std::random_device{}() }; + static std::uniform_real_distribution dist(0.0, 1.0); for (auto& xi : x) { - xi = rdiff::rvar (dist(gen)); + xi = rdiff::rvar (static_cast (dist(gen))); } auto& tape = rdiff::get_active_tape (); tape.add_checkpoint();