diff --git a/doc/optimization/gradient_optimizers.qbk b/doc/optimization/gradient_optimizers.qbk index 1977021d4..9448ed751 100644 --- a/doc/optimization/gradient_optimizers.qbk +++ b/doc/optimization/gradient_optimizers.qbk @@ -99,7 +99,7 @@ The interface is intended to be pytorch-like, where a optimizer object is constr * `Objective&& obj` : objective function to minimize * `ArgumentContainer& x` : variables to optimize over * `RealType& lr` : learning rate. A larger value takes larger steps during descent, leading to faster, but more unstable convergence. Conversely, small values are more stable but take longer to converge. -* `InitializationPolicy&& ip` : Initialization policy for `ArgumentContainer`, or the initial guess. By default it is set to `tape_initializer_rvar` which lets the user provide the "initial guess" by setting the values of `x` manually. For more info check the Policies section. +* `InitializationPolicy&& ip` : Initialization policy for optimizer state and variables. Users may supply a custom initialization policy to control how the argument container and any AD-specific runtime state (e.g. reverse-mode tape attachment/reset) are initialized. By default, the optimizer uses the user-provided initial values in `x` and performs the standard reverse-mode AD initialization required for gradient evaluation. Custom initialization policies are useful for randomized starts, non-`rvar` AD types, or when gradients are supplied externally. See the reverse-mode autodiff policy documentation for the required initialization policy interface when writing custom policies. * `ObjectiveEvalPolicy&& oep` : tells the optimizer how to evaluate the objective function. By default `reverse_mode_function_eval_policy`. * `GradEvalPolicy&& gep` : tells the optimizer how to evaluate the gradient of the objective function. By default `reverse_mode_gradient_evaluation_policy` @@ -415,7 +415,7 @@ Setting mu = 0 reduces NAG to standard gradient descent.
* `ArgumentContainer& x` : variables to optimize over. Updated in place. * `RealType lr` : learning rate. Larger values take larger steps (faster but potentially unstable). Smaller values are more stable but converge more slowly. * `RealType mu` : momentum coefficient in `[0,1)`. Higher values, e.g. 0.9 to 0.99, typically accelerate convergence but may require a smaller `lr` -* `InitializationPolicy&& ip` : initialization policy for the optimizer state and variables. For NAG, this also initializes the internal momentum/velocity state. By default the optimizer uses the same initializer as gradient descent and initializes velocity to zero. +* `InitializationPolicy&& ip` : Initialization policy for optimizer state and variables. Users may supply a custom initialization policy to control how the argument container and any AD-specific runtime state (e.g. reverse-mode tape attachment/reset) are initialized. By default, the optimizer uses the same initialization as gradient descent, taking the user-provided initial values in `x` and initializing the internal momentum/velocity state to zero. Custom initialization policies are useful for randomized starts, non-`rvar` AD types, or when gradients are supplied externally. See the reverse-mode autodiff policy documentation for the required initialization policy interface when writing custom policies. * `ObjectiveEvalPolicy&& oep` : objective evaluation policy. By default `reverse_mode_function_eval_policy` * `GradEvalPolicy&& gep` : gradient evaluation policy. By default `reverse_mode_gradient_evaluation_policy` @@ -700,11 +700,10 @@ The line search is a key part of practical LBFGS: it typically removes the need * `Objective&& obj` : objective function to minimize. * `ArgumentContainer& x` : variables to optimize over. Updated in-place. * `std::size_t m` : history size. Typical values are 5–20. Default is 10. Larger m can improve directions but increases memory and per-step cost.
-* `InitializationPolicy&& ip` : initialization policy for ArgumentContainer and optimizer state. For reverse-mode AD, the default typically initializes/attaches the tape and uses the user-provided initial values in x. +* `InitializationPolicy&& ip` : Initialization policy for optimizer state and variables. Users may supply a custom initialization policy to control how the argument container and any AD-specific runtime state (e.g. reverse-mode tape attachment/reset) are initialized. By default, the optimizer uses the user-provided initial values in `x` and performs the standard reverse-mode AD initialization required for gradient evaluation. Custom initialization policies are useful for randomized starts, non-`rvar` AD types, or when gradients are supplied externally. See the reverse-mode autodiff policy documentation for the required initialization policy interface when writing custom policies. * `ObjectiveEvalPolicy&& oep` : policy for evaluating the objective function value at a given x. By default this is a reverse-mode AD evaluation policy when using `rvar`. * `GradEvalPolicy&& gep` : policy for evaluating the gradient of the objective. By default this is a reverse-mode AD gradient evaluation policy when using `rvar`. -* `LineSearchPolicy&& lsp` : policy for selecting the step length alpha. Default is Strong Wolfe, but Armijo is an option. - +* `LineSearchPolicy&& lsp` : policy for selecting the step length alpha along a search direction. Determines the acceptance criteria and how many function/gradient evaluations may be performed during a step. Default is Strong Wolfe, but Armijo is an option. Strong Wolfe uses both function and gradient information to ensure good curvature conditions, while Armijo relies only on function decrease and is simpler but less robust for quasi-Newton methods. [heading Notes] * LBFGS assumes the objective is sufficiently smooth for gradients to be informative. It is typically most effective on unconstrained smooth problems.
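To make the Armijo acceptance criterion described above concrete, here is a minimal, self-contained backtracking sketch. This is not the library's `LineSearchPolicy` implementation; the function name, signature, and default constants are hypothetical, chosen only to illustrate the sufficient-decrease test.

```cpp
#include <cmath>

// Hedged sketch (hypothetical helper, not the library's API): Armijo
// backtracking line search for a 1-D objective. Accepts the first step
// length alpha satisfying the sufficient-decrease condition
//     f(x + alpha*d) <= f(x) + c1 * alpha * f'(x) * d,
// halving alpha until it holds. d must be a descent direction, so the
// directional derivative f'(x)*d is negative.
template <class F, class G>
double armijo_backtrack(F f, G grad, double x, double d,
                        double alpha0 = 1.0, double c1 = 1e-4)
{
    const double fx    = f(x);
    const double slope = grad(x) * d; // directional derivative along d
    double alpha = alpha0;
    while (f(x + alpha * d) > fx + c1 * alpha * slope)
        alpha *= 0.5;                 // shrink until sufficient decrease holds
    return alpha;
}
```

For `f(x) = x*x` starting at `x = 1` with steepest-descent direction `d = -2`, the first halving to `alpha = 0.5` satisfies the condition, and the resulting step lands on the minimizer `x = 0`. Strong Wolfe would additionally check a curvature condition on `grad(x + alpha*d)`, which is why it needs gradient evaluations inside the search.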
diff --git a/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp b/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp index 9e7e989ef..a00a4c1d6 100644 --- a/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp +++ b/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp @@ -2,11 +2,10 @@ // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or copy at // https://www.boost.org/LICENSE_1_0.txt) -#ifndef DIFFERENTIABLE_OPT_UTILITIES_HPP -#define DIFFERENTIABLE_OPT_UTILITIES_HPP +#ifndef BOOST_MATH_OPTIMIZATION_DETAIL_DIFFERENTIABLE_OPT_UTILITIES_HPP +#define BOOST_MATH_OPTIMIZATION_DETAIL_DIFFERENTIABLE_OPT_UTILITIES_HPP #include -#include -#include +#include #include #include #include @@ -70,7 +69,6 @@ struct argument_container_t, Args...>> }; /******************************************************************************/ /** @brief simple blas helpers - * may optimize later if benchmarks show its needed, or just switch to Eigen */ template auto @@ -146,12 +144,6 @@ random_vector(size_t n) /** @brief> generates a random std::vector of size n * using mt19937 algorithm */ - - /** TODO: these may need to be marked thread local * in the future * * TODO: benchmark.
- */ static boost::random::mt19937 rng{ std::random_device{}() }; static boost::random::uniform_real_distribution dist(0.0, 1.0); diff --git a/include/boost/math/optimization/detail/gradient_opt_base.hpp b/include/boost/math/optimization/detail/gradient_opt_base.hpp index 58fd3ac2b..e13b79279 100644 --- a/include/boost/math/optimization/detail/gradient_opt_base.hpp +++ b/include/boost/math/optimization/detail/gradient_opt_base.hpp @@ -2,8 +2,8 @@ // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or copy at // https://www.boost.org/LICENSE_1_0.txt) -#ifndef GRADIENT_OPT_BASE_HPP -#define GRADIENT_OPT_BASE_HPP +#ifndef BOOST_MATH_OPTIMIZATION_DETAIL_GRADIENT_OPT_BASE_HPP +#define BOOST_MATH_OPTIMIZATION_DETAIL_GRADIENT_OPT_BASE_HPP #include namespace boost { diff --git a/include/boost/math/optimization/detail/line_search_policies.hpp b/include/boost/math/optimization/detail/line_search_policies.hpp index 0dd85867b..1bc3c1cc4 100644 --- a/include/boost/math/optimization/detail/line_search_policies.hpp +++ b/include/boost/math/optimization/detail/line_search_policies.hpp @@ -3,13 +3,11 @@ // (See accompanying file LICENSE_1_0.txt or copy at // https://www.boost.org/LICENSE_1_0.txt) -#ifndef LINE_SEARCH_POLICIES_HPP -#define LINE_SEARCH_POLICIES_HPP +#ifndef BOOST_MATH_OPTIMIZATION_DETAIL_LINE_SEARCH_POLICIES_HPP +#define BOOST_MATH_OPTIMIZATION_DETAIL_LINE_SEARCH_POLICIES_HPP #include #include -#include -#include #include namespace boost { namespace math { diff --git a/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp b/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp index 5258728bd..dbb365b5e 100644 --- a/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp +++ b/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp @@ -2,13 +2,14 @@ // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE_1_0.txt or copy at // https://www.boost.org/LICENSE_1_0.txt) -#ifndef RDIFF_OPTIMIZATION_POLICIES_HPP__ -#define RDIFF_OPTIMIZATION_POLICIES_HPP__ +#ifndef BOOST_MATH_OPTIMIZATION_DETAIL_RDIFF_OPTIMIZATION_POLICIES_HPP +#define BOOST_MATH_OPTIMIZATION_DETAIL_RDIFF_OPTIMIZATION_POLICIES_HPP #include -#include #include #include +#include + namespace boost { namespace math { namespace optimization { @@ -101,8 +102,8 @@ struct random_uniform_initializer_rvar template void operator()(ArgumentContainer& x) const { - boost::random::mt19937 gen(seed_); - boost::random::uniform_real_distribution dist(low_, high_); + static boost::random::mt19937 gen{ std::random_device{}() }; + boost::random::uniform_real_distribution dist(low_, high_); for (auto& xi : x) { xi = rdiff::rvar(dist(gen)); } diff --git a/include/boost/math/optimization/gradient_descent.hpp b/include/boost/math/optimization/gradient_descent.hpp index f88f464c8..68972e296 100644 --- a/include/boost/math/optimization/gradient_descent.hpp +++ b/include/boost/math/optimization/gradient_descent.hpp @@ -2,12 +2,11 @@ // Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at // https://www.boost.org/LICENSE_1_0.txt) -#ifndef GRADIENT_DESCENT_HPP -#define GRADIENT_DESCENT_HPP +#ifndef BOOST_MATH_OPTIMIZATION_GRADIENT_DESCENT_HPP +#define BOOST_MATH_OPTIMIZATION_GRADIENT_DESCENT_HPP #include #include #include -namespace rdiff = boost::math::differentiation::reverse_mode; namespace boost { namespace math { @@ -26,10 +25,6 @@ struct gradient_descent_update_policy ArgumentType>::value>::type> void operator()(ArgumentType& x, RealType& g) { - // this update effectively "mutes" the tape - // TODO: add a tape scope guard method so that - // you can do math on autodiff types without - // accumulating gradients x.get_value() -= lr_ * g; } template #include #include diff --git a/include/boost/math/optimization/lbfgs.hpp b/include/boost/math/optimization/lbfgs.hpp index 57b70d21a..5554b7721 100644 --- a/include/boost/math/optimization/lbfgs.hpp +++ b/include/boost/math/optimization/lbfgs.hpp @@ -2,22 +2,20 @@ // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or copy at // https://www.boost.org/LICENSE_1_0.txt) -#ifndef LBFGS_HPP -#define LBFGS_HPP +#ifndef BOOST_MATH_OPTIMIZATION_LBFGS_HPP +#define BOOST_MATH_OPTIMIZATION_LBFGS_HPP #include #include #include #include -#include "boost/math/optimization/detail/line_search_policies.hpp" +#include #include namespace boost { namespace math { namespace optimization { -namespace rdiff = boost::math::differentiation::reverse_mode; - /** @brief> Helper struct for L-BFGS * * stores state of L-BFGS optimizer diff --git a/include/boost/math/optimization/minimizer.hpp b/include/boost/math/optimization/minimizer.hpp index d79059b25..929c92b2f 100644 --- a/include/boost/math/optimization/minimizer.hpp +++ b/include/boost/math/optimization/minimizer.hpp @@ -2,11 +2,12 @@ // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE_1_0.txt or copy at // https://www.boost.org/LICENSE_1_0.txt) -#ifndef MINIMIZER_HPP -#define MINIMIZER_HPP +#ifndef BOOST_MATH_OPTIMIZATION_MINIMIZER_HPP +#define BOOST_MATH_OPTIMIZATION_MINIMIZER_HPP #include #include #include +#include namespace boost { namespace math { namespace optimization { diff --git a/include/boost/math/optimization/nesterov.hpp b/include/boost/math/optimization/nesterov.hpp index efd13a502..1c882f401 100644 --- a/include/boost/math/optimization/nesterov.hpp +++ b/include/boost/math/optimization/nesterov.hpp @@ -2,8 +2,8 @@ // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or copy at // https://www.boost.org/LICENSE_1_0.txt) -#ifndef NESTEROV_HPP -#define NESTEROV_HPP +#ifndef BOOST_MATH_OPTIMIZATION_NESTEROV_HPP +#define BOOST_MATH_OPTIMIZATION_NESTEROV_HPP #include #include #include diff --git a/test/test_gradient_descent_optimizer.cpp b/test/test_gradient_descent_optimizer.cpp index bc9e029f0..d44a89b98 100644 --- a/test/test_gradient_descent_optimizer.cpp +++ b/test/test_gradient_descent_optimizer.cpp @@ -60,7 +60,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(random_initializer_test, T, all_float_types) x, lr, bopt::random_uniform_initializer_rvar( - -2.0, 2.0, 1234)); // all initialized to 5 + T(-2.0), T(2.0), 1234)); for (auto& xi : x) { T v = xi.item(); BOOST_TEST(v >= -2);
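The gradient-descent and momentum updates documented above can be sketched on a toy objective. This is a hedged illustration only: the free functions below are hypothetical, not the library's API, and the momentum loop uses one common Nesterov look-ahead formulation (applied to `f(x) = x^2`, gradient `2x`). It also demonstrates the documented claim that `mu = 0` reduces NAG to standard gradient descent.

```cpp
#include <cmath>

// Hedged sketch (hypothetical helpers, not the library's API).

// Plain gradient descent on f(x) = x^2: x <- x - lr * f'(x).
inline double gd_minimize(double x, double lr, int steps)
{
    for (int i = 0; i < steps; ++i)
        x -= lr * 2.0 * x;
    return x;
}

// One common Nesterov momentum formulation: evaluate the gradient at the
// look-ahead point x + mu*v, then update velocity and position. The
// velocity state starts at zero, matching the documented default
// initialization.
inline double nag_minimize(double x, double lr, double mu, int steps)
{
    double v = 0.0;
    for (int i = 0; i < steps; ++i) {
        const double g = 2.0 * (x + mu * v); // gradient at look-ahead point
        v = mu * v - lr * g;
        x += v;
    }
    return x;
}
```

With `mu = 0` the velocity term vanishes and each iteration is exactly the plain gradient-descent step; with `mu` near 0.9 the iterates oscillate but contract faster per gradient evaluation on ill-conditioned problems, which is why the docs suggest pairing high `mu` with a smaller `lr`.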