From e2ce13e3fb285fd0a2edc986017fdf19f84650c4 Mon Sep 17 00:00:00 2001
From: mzhelyez <maksym.zhelyeznyakov@planopsim.com>
Date: Sat, 4 Oct 2025 17:28:01 +0200
Subject: [PATCH] saving progress on linux machine

---
 .../math/differentiation/autodiff_reverse.hpp |   2 +
 ...everse_mode_autodiff_memory_management.hpp |   2 +-
 .../detail/differentiable_opt_utilties.hpp    |  80 +++++
 .../optimization/detail/gradient_opt_base.hpp |  72 ++++
 .../detail/rdiff_optimization_policies.hpp    | 140 ++++++++
 .../math/optimization/gradient_descent.hpp    | 143 ++++++++
 .../math/optimization/gradient_optimizers.hpp |  12 +
 include/boost/math/optimization/minimizer.hpp | 321 +++++++++++++++++
 include/boost/math/optimization/nesterov.hpp  |  15 +
 test/test_functions_for_optimization.hpp      |  10 +
 test/test_gradient_descent_optimizer.cpp      | 328 ++++++++++++++++++
 11 files changed, 1124 insertions(+), 1 deletion(-)
 create mode 100644 include/boost/math/optimization/detail/differentiable_opt_utilties.hpp
 create mode 100644 include/boost/math/optimization/detail/gradient_opt_base.hpp
 create mode 100644 include/boost/math/optimization/detail/rdiff_optimization_policies.hpp
 create mode 100644 include/boost/math/optimization/gradient_descent.hpp
 create mode 100644 include/boost/math/optimization/gradient_optimizers.hpp
 create mode 100644 include/boost/math/optimization/minimizer.hpp
 create mode 100644 include/boost/math/optimization/nesterov.hpp
 create mode 100644 test/test_gradient_descent_optimizer.cpp
diff --git a/include/boost/math/differentiation/autodiff_reverse.hpp b/include/boost/math/differentiation/autodiff_reverse.hpp
index 57584a765..4670b1d13 100644
--- a/include/boost/math/differentiation/autodiff_reverse.hpp
+++ b/include/boost/math/differentiation/autodiff_reverse.hpp
@@ -580,6 +580,8 @@ public:
         }
         it->backward();
     }
+
+    void set_item(RealType value) { value_ = inner_t(value); }
 };
 
 template<typename RealType, size_t DerivativeOrder>
diff --git a/include/boost/math/differentiation/detail/reverse_mode_autodiff_memory_management.hpp b/include/boost/math/differentiation/detail/reverse_mode_autodiff_memory_management.hpp
index b0b5ab055..843e3f969 100644
--- a/include/boost/math/differentiation/detail/reverse_mode_autodiff_memory_management.hpp
+++ b/include/boost/math/differentiation/detail/reverse_mode_autodiff_memory_management.hpp
@@ -266,7 +266,7 @@ public:
     void add_checkpoint()
     {
         if (total_size_ > 0) {
-            checkpoints_.push_back(total_size_ - 1);
+            checkpoints_.push_back(total_size_); //- 1);
         } else {
             checkpoints_.push_back(0);
         }
diff --git a/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp b/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp
new file mode 100644
index 000000000..32093647b
--- /dev/null
+++ b/include/boost/math/optimization/detail/differentiable_opt_utilties.hpp
@@ -0,0 +1,80 @@
+#ifndef DIFFERENTIABLE_OPT_UTILITIES_HPP
+#define DIFFERENTIABLE_OPT_UTILITIES_HPP
+#include <cmath>
+#include <random>
+#include <type_traits>
+#include <vector>
+
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <vector>
+
+namespace boost {
+namespace math {
+namespace optimization {
+template<typename UpdPol>
+struct update_policy_real_type;
+
+template<typename UpdPol>
+struct update_policy_real_type;
+
+template<template<typename> class UpdPol, typename RealType>
+struct update_policy_real_type<UpdPol<RealType>>
+{
+    using type = RealType;
+};
+
+template<typename UpdPol>
+using update_policy_real_type_t =
+    typename update_policy_real_type<typename std::decay<UpdPol>::type>::type;
+
+template<class RealType>
+RealType gradient_norm2(const std::vector<std::reference_wrapper<RealType>>& g)
+{
+    /* @brief computes 2 norm of a vector of reference wrapped RealTypes
+	*/
+    RealType sum = RealType(0);
+    for (auto& gi : g) {
+        RealType val = gi.get();
+        sum += val * val;
+    }
+    return sqrt(sum);
+}
+
+template<class RealType>
+RealType gradient_norm1(const std::vector<std::reference_wrapper<RealType>>& g)
+{
+    /* @brief computes 2 norm of a vector of reference wrapped RealTypes
+	*/
+    RealType sum = RealType(0);
+    for (auto& gi : g) {
+        RealType val = gi.get();
+        sum += abs(val);
+    }
+    return sqrt(sum);
+}
+
+template<typename RealType>
+std::vector<RealType> random_vector(size_t n)
+{
+    /*@brief> generates a random std::vector<RealType> of size n
+     * using mt19937 algorithm
+     */
+
+    /* TODO: these may need to be marked thread local 
+     * in the future
+	* 
+	* TODO: benchmark.
+	*/
+    static boost::random::mt19937                             rng{std::random_device{}()};
+    static boost::random::uniform_real_distribution<RealType> dist(0.0, 1.0);
+
+    std::vector<RealType> result(n);
+    std::generate(result.begin(), result.end(), [&] { return dist(rng); });
+    return result;
+}
+
+} // namespace optimization
+} // namespace math
+} // namespace boost
+#endif
diff --git a/include/boost/math/optimization/detail/gradient_opt_base.hpp b/include/boost/math/optimization/detail/gradient_opt_base.hpp
new file mode 100644
index 000000000..62d06d206
--- /dev/null
+++ b/include/boost/math/optimization/detail/gradient_opt_base.hpp
@@ -0,0 +1,72 @@
+#ifndef GRADIENT_OPT_BASE_HPP
+#define GRADIENT_OPT_BASE_HPP
+#include <boost/math/differentiation/autodiff_reverse.hpp>
+
+namespace boost {
+namespace math {
+namespace optimization {
+
+namespace rdiff = boost::math::differentiation::reverse_mode;
+
+template<typename ArgumentContainer,
+         typename RealType,
+         class Objective,
+         class InitializationPolicy,
+         class ObjectiveEvalPolicy,
+         class GradEvalPolicy,
+         class UpdatePolicy,
+         typename DerivedOptimizer>
+class abstract_optimizer
+{
+private:
+    Objective                                     objective_; // obj function
+    ArgumentContainer&                            x_;         // arguments to objective function
+    std::vector<std::reference_wrapper<RealType>> g_;        // container of references to gradients
+    ObjectiveEvalPolicy                           obj_eval_; // how to evaluate your funciton
+    GradEvalPolicy                                grad_eval_; // how to evaluate/bind gradients
+    InitializationPolicy                          init_;      // how to initialize the problem
+    UpdatePolicy                                  update_;    // update step
+    RealType                                      obj_v_;     // objective value (for history)
+    // access derived class
+    DerivedOptimizer&       derived() { return static_cast<DerivedOptimizer&>(*this); }
+    const DerivedOptimizer& derived() const { return static_cast<const DerivedOptimizer&>(*this); }
+
+public:
+    using argument_container_t = ArgumentContainer;
+    using real_type_t          = RealType;
+
+    abstract_optimizer(Objective&&            objective,
+                       ArgumentContainer&     x,
+                       InitializationPolicy&& ip,
+                       ObjectiveEvalPolicy&&  oep,
+                       GradEvalPolicy&&       gep,
+                       UpdatePolicy&&         up)
+        : objective_(std::forward<Objective>(objective))
+        , x_(x)
+        , obj_eval_(std::forward<ObjectiveEvalPolicy>(oep))
+        , grad_eval_(std::forward<GradEvalPolicy>(gep))
+        , init_(std::forward<InitializationPolicy>(ip))
+        , update_(std::forward<UpdatePolicy>(up))
+    {
+        init_(x_);               // initialize your problem
+        grad_eval_.bind(x_, g_); // bind gradients to g_ container
+    }
+    void step()
+    {
+        grad_eval_(objective_, x_, obj_eval_, obj_v_);
+        for (size_t i = 0; i < x_.size(); ++i) {
+            update_(x_[i], g_[i].get());
+        }
+    };
+    ArgumentContainer&       arguments() { return derived().x_; }
+    const ArgumentContainer& arguments() const { return derived().x_; }
+
+    RealType&       objective_value() { return derived().obj_v_; }
+    const RealType& objective_value() const { return derived().obj_v_; }
+    std::vector<std::reference_wrapper<RealType>>&       gradients() { return derived().g_; }
+    const std::vector<std::reference_wrapper<RealType>>& gradients() const { return derived().g_; }
+};
+} // namespace optimization
+} // namespace math
+} // namespace boost
+#endif
diff --git a/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp b/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp
new file mode 100644
index 000000000..0951ce93e
--- /dev/null
+++ b/include/boost/math/optimization/detail/rdiff_optimization_policies.hpp
@@ -0,0 +1,140 @@
+#ifndef RDIFF_OPTIMIZATION_POLICIES_HPP__
+#define RDIFF_OPTIMIZATION_POLICIES_HPP__
+
+#include <boost/math/differentiation/autodiff_reverse.hpp>
+#include <boost/random.hpp>
+#include <random>
+
+namespace boost {
+namespace math {
+namespace optimization {
+
+namespace rdiff = boost::math::differentiation::reverse_mode;
+
+/******************************************************************
+ * @brief> function evaluation policy for reverse mode autodiff
+ * @arg objective> objective function to evaluate
+ * @arg x> argument list
+*/
+template<typename RealType>
+struct reverse_mode_function_eval_policy
+{
+    template<typename Objective, class ArgumentContainer>
+    rdiff::rvar<RealType, 1> operator()(Objective &&objective, ArgumentContainer &x)
+    {
+        auto &tape = rdiff::get_active_tape<RealType, 1>();
+        tape.zero_grad();
+        tape.rewind_to_last_checkpoint();
+        return objective(x);
+    }
+};
+/******************************************************************
+ * @brief> gradient evaluation policy
+ * @arg obj_f> objective
+ * @arg x> argument list
+ * @arg f_eval_olicy> funciton evaluation policy. These need to be
+ *                    done in tandem
+ * @arg obj_v> reference to variable inside gradient class 
+*/
+template<typename RealType>
+struct reverse_mode_gradient_evaluation_policy
+{
+    template<typename ArgumentContainer>
+    void bind(ArgumentContainer &x, std::vector<std::reference_wrapper<RealType>> &g)
+    {
+        g.reserve(x.size());
+        for (auto &xi : x) {
+            g.push_back(std::ref(xi.adjoint()));
+        }
+    }
+    template<class Objective, class ArgumentContainer, class FunctionEvaluationPolicy>
+    void operator()(Objective                &&obj_f,
+                    ArgumentContainer         &x,
+                    FunctionEvaluationPolicy &&f_eval_pol,
+                    RealType                  &obj_v)
+    {
+        // compute objective via eval policy that takes care of tape
+        rdiff::rvar<RealType, 1> v = f_eval_pol(obj_f, x);
+        v.backward();
+        obj_v = v.item();
+    }
+};
+
+/*************************************************************************
+ * update policy
+ */
+template<typename RealType>
+struct nesterov_update_policy
+{
+    RealType lr_;
+    RealType mu_;
+    nesterov_update_policy(RealType lr, RealType mu)
+        : lr_(lr)
+        , mu_(mu)
+    {}
+
+    void operator()(rdiff::rvar<RealType, 1> &x) {}
+};
+/******************************************************************
+ * init policies
+ */
+template<typename RealType>
+struct tape_initializer_rvar
+{
+    template<class ArgumentContainer>
+    void operator()(ArgumentContainer &) const noexcept
+    {
+        static_assert(std::is_same<typename ArgumentContainer::value_type,
+                                   rdiff::rvar<RealType, 1> >::value,
+                      "ArgumentContainer::value_type must be rdiff::rvar<RealType,1>");
+        auto &tape = rdiff::get_active_tape<RealType, 1>();
+        tape.add_checkpoint();
+    }
+};
+
+template<typename RealType>
+struct random_uniform_initializer_rvar
+{
+    RealType low_, high_;
+    size_t   seed_;
+    random_uniform_initializer_rvar(RealType low  = 0,
+                                    RealType high = 1,
+                                    size_t   seed = std::random_device{}())
+        : low_(low)
+        , high_(high)
+        , seed_(seed){};
+
+    template<class ArgumentContainer>
+    void operator()(ArgumentContainer &x) const
+    {
+        boost::random::mt19937                             gen(seed_);
+        boost::random::uniform_real_distribution<RealType> dist(low_, high_);
+        for (auto &xi : x) {
+            xi = rdiff::rvar<RealType, 1>(dist(gen));
+        }
+        auto &tape = rdiff::get_active_tape<RealType, 1>();
+        tape.add_checkpoint();
+    }
+};
+
+template<typename RealType>
+struct costant_initializer_rvar
+{
+    RealType constant;
+    explicit costant_initializer_rvar(RealType v = 0)
+        : constant(v){};
+    template<class ArgumentContainer>
+    void operator()(ArgumentContainer &x) const
+    {
+        for (auto &xi : x) {
+            xi = rdiff::rvar<RealType, 1>(constant);
+        }
+        auto &tape = rdiff::get_active_tape<RealType, 1>();
+        tape.add_checkpoint();
+    }
+};
+} // namespace optimization
+} // namespace math
+} // namespace boost
+
+#endif
diff --git a/include/boost/math/optimization/gradient_descent.hpp b/include/boost/math/optimization/gradient_descent.hpp
new file mode 100644
index 000000000..d0888f714
--- /dev/null
+++ b/include/boost/math/optimization/gradient_descent.hpp
@@ -0,0 +1,143 @@
+#ifndef GRADIENT_DESCENT_HPP
+#define GRADIENT_DESCENT_HPP
+#include <boost/math/optimization/detail/differentiable_opt_utilties.hpp>
+#include <boost/math/optimization/detail/gradient_opt_base.hpp>
+#include <boost/math/optimization/detail/rdiff_optimization_policies.hpp>
+#include <functional>
+#include <vector>
+namespace rdiff = boost::math::differentiation::reverse_mode;
+
+namespace boost {
+namespace math {
+namespace optimization {
+
+template<typename RealType>
+struct gradient_descent_update_policy
+{
+    RealType lr_;
+    gradient_descent_update_policy(RealType lr)
+        : lr_(lr){};
+
+    template<typename ArgumentType,
+             typename = typename std::enable_if<boost::math::differentiation::reverse_mode::detail::
+                                                    is_expression<ArgumentType>::value>::type>
+    void operator()(ArgumentType &x, RealType &g)
+    {
+        // this update effectively "mutes" the tape
+        // TODO: add a tape scope guard method so that
+        // you can do math on autodiff types without
+        // accumulating gradients
+        x.get_value() -= lr_ * g;
+    }
+    template<
+        typename ArgumentType,
+        typename std::enable_if<
+            !boost::math::differentiation::reverse_mode::detail::is_expression<ArgumentType>::value,
+            int>::type
+        = 0>
+    void operator()(ArgumentType &x, RealType &g) const
+    {
+        x -= lr_ * g;
+    }
+};
+template<typename ArgumentContainer,
+         typename RealType,
+         class Objective,
+         class InitializationPolicy,
+         class ObjectiveEvalPolicy,
+         class GradEvalPolicy>
+class gradient_descent : public abstract_optimizer<ArgumentContainer,
+                                                   RealType,
+                                                   Objective,
+                                                   InitializationPolicy,
+                                                   ObjectiveEvalPolicy,
+                                                   GradEvalPolicy,
+                                                   gradient_descent_update_policy<RealType>,
+                                                   gradient_descent<ArgumentContainer,
+                                                                    RealType,
+                                                                    Objective,
+                                                                    InitializationPolicy,
+                                                                    ObjectiveEvalPolicy,
+                                                                    GradEvalPolicy>>
+{
+    using base_opt = abstract_optimizer<ArgumentContainer,
+                                        RealType,
+                                        Objective,
+                                        InitializationPolicy,
+                                        ObjectiveEvalPolicy,
+                                        GradEvalPolicy,
+                                        gradient_descent_update_policy<RealType>,
+                                        gradient_descent<ArgumentContainer,
+                                                         RealType,
+                                                         Objective,
+                                                         InitializationPolicy,
+                                                         ObjectiveEvalPolicy,
+                                                         GradEvalPolicy>>;
+
+public:
+    using base_opt::base_opt;
+};
+template<class Objective, typename ArgumentContainer, typename RealType>
+auto make_gradient_descent(Objective &&obj, ArgumentContainer &x, RealType lr = RealType{0.01})
+{
+    return gradient_descent<ArgumentContainer,
+                            RealType,
+                            Objective,
+                            tape_initializer_rvar<RealType>,
+                            reverse_mode_function_eval_policy<RealType>,
+                            reverse_mode_gradient_evaluation_policy<RealType>>(
+        std::forward<Objective>(obj),
+        x,
+        tape_initializer_rvar<RealType>{},
+        reverse_mode_function_eval_policy<RealType>{},
+        reverse_mode_gradient_evaluation_policy<RealType>{},
+        gradient_descent_update_policy<RealType>(lr));
+}
+template<class Objective, typename ArgumentContainer, typename RealType, class InitializationPolicy>
+auto make_gradient_descent(Objective            &&obj,
+                           ArgumentContainer     &x,
+                           RealType               lr,
+                           InitializationPolicy &&ip)
+{
+    return gradient_descent<ArgumentContainer,
+                            RealType,
+                            Objective,
+                            InitializationPolicy,
+                            reverse_mode_function_eval_policy<RealType>,
+                            reverse_mode_gradient_evaluation_policy<RealType>>(
+        std::forward<Objective>(obj),
+        x,
+        std::forward<InitializationPolicy>(ip),
+        reverse_mode_function_eval_policy<RealType>{},
+        reverse_mode_gradient_evaluation_policy<RealType>{},
+        gradient_descent_update_policy<RealType>(lr));
+}
+template<typename ArgumentContainer,
+         typename RealType,
+         class Objective,
+         class InitializationPolicy,
+         class ObjectiveEvalPolicy,
+         class GradEvalPolicy>
+auto make_gradient_descent(Objective            &&obj,
+                           ArgumentContainer     &x,
+                           RealType              &lr,
+                           InitializationPolicy &&ip,
+                           ObjectiveEvalPolicy  &&oep,
+                           GradEvalPolicy       &&gep)
+{
+    return gradient_descent<ArgumentContainer,
+                            RealType,
+                            Objective,
+                            InitializationPolicy,
+                            ObjectiveEvalPolicy,
+                            GradEvalPolicy>(std::forward<Objective>(obj),
+                                            x,
+                                            std::forward<InitializationPolicy>(ip),
+                                            std::forward<ObjectiveEvalPolicy>(oep),
+                                            std::forward<GradEvalPolicy>(gep),
+                                            gradient_descent_update_policy<RealType>{lr});
+}
+} // namespace optimization
+} // namespace math
+} // namespace boost
+#endif
diff --git a/include/boost/math/optimization/gradient_optimizers.hpp b/include/boost/math/optimization/gradient_optimizers.hpp
new file mode 100644
index 000000000..38a98aac2
--- /dev/null
+++ b/include/boost/math/optimization/gradient_optimizers.hpp
@@ -0,0 +1,12 @@
+#ifndef GRADIENT_OPTIMIZERS_HPP
+#define GRADIENT_OPTIMIZERS_HPP
+#include <boost/math/differentiation/autodiff_reverse.hpp>
+#include <boost/math/optimization/gradient_descent.hpp>
+
+namespace boost {
+namespace math {
+namespace optimization {
+} // namespace optimization
+} // namespace math
+} // namespace boost
+#endif
diff --git a/include/boost/math/optimization/minimizer.hpp b/include/boost/math/optimization/minimizer.hpp
new file mode 100644
index 000000000..d7d63e9a0
--- /dev/null
+++ b/include/boost/math/optimization/minimizer.hpp
@@ -0,0 +1,321 @@
+#ifndef MINIMIZER_HPP
+#define MINIMIZER_HPP
+#include <boost/math/optimization/detail/differentiable_opt_utilties.hpp>
+#include <boost/math/optimization/gradient_optimizers.hpp>
+#include <vector>
+namespace boost {
+namespace math {
+namespace optimization {
+template<typename RealType>
+struct optimization_result
+{
+    size_t                num_iter = 0;
+    RealType              objective_value;
+    std::vector<RealType> objective_history;
+    bool                  converged;
+};
+
+template<typename RealType>
+std::ostream& operator<<(std::ostream& os, const optimization_result<RealType>& r)
+{
+    os << "optimization_result {\n"
+       << "  num_iter        = " << r.num_iter << "\n"
+       << "  objective_value = " << r.objective_value << "\n"
+       << "  converged       = " << std::boolalpha << r.converged << "\n"
+       << "  objective_history = [";
+
+    for (std::size_t i = 0; i < r.objective_history.size(); ++i) {
+        os << r.objective_history[i];
+        if (i + 1 < r.objective_history.size()) {
+            os << ", ";
+        }
+    }
+    os << "]\n}\n";
+    return os;
+}
+/*****************************************************************************************/
+template<typename RealType>
+struct gradient_norm_convergence_policy
+{
+    RealType tol_;
+    explicit gradient_norm_convergence_policy(RealType tol)
+        : tol_(tol)
+    {}
+
+    template<class GradientContainer>
+    bool operator()(const GradientContainer& g, RealType /*objective_v*/) const
+    {
+        return gradient_norm2(g) < tol_;
+    }
+};
+
+template<typename RealType>
+struct objective_tol_convergence_policy
+{
+    RealType         tol_;
+    mutable RealType last_value_;
+    mutable bool     first_call_;
+
+    explicit objective_tol_convergence_policy(RealType tol)
+        : tol_(tol)
+        , last_value_(0)
+        , first_call_(true)
+    {}
+
+    template<class GradientContainer>
+    bool operator()(const GradientContainer&, RealType objective_v) const
+    {
+        if (first_call_) {
+            last_value_ = objective_v;
+            first_call_ = false;
+            return false;
+        }
+        RealType diff = abs(objective_v - last_value_);
+        last_value_   = objective_v;
+        return diff < tol_;
+    }
+};
+
+template<typename RealType>
+struct relative_objective_tol_policy
+{
+    RealType         rel_tol_;
+    mutable RealType last_value_;
+    mutable bool     first_call_;
+
+    explicit relative_objective_tol_policy(RealType rel_tol)
+        : rel_tol_(rel_tol)
+        , last_value_(0)
+        , first_call_(true)
+    {}
+
+    template<class GradientContainer>
+    bool operator()(const GradientContainer&, RealType objective_v) const
+    {
+        if (first_call_) {
+            last_value_ = objective_v;
+            first_call_ = false;
+            return false;
+        }
+        RealType denom    = std::max<RealType>(1, std::abs(last_value_));
+        RealType rel_diff = std::abs(objective_v - last_value_) / denom;
+        last_value_       = objective_v;
+        return rel_diff < rel_tol_;
+    }
+};
+
+template<class Policy1, class Policy2>
+struct combined_convergence_policy
+{
+    Policy1 p1_;
+    Policy2 p2_;
+
+    combined_convergence_policy(Policy1 p1, Policy2 p2)
+        : p1_(p1)
+        , p2_(p2)
+    {}
+
+    template<class GradientContainer, class RealType>
+    bool operator()(const GradientContainer& g, RealType obj) const
+    {
+        return p1_(g, obj) || p2_(g, obj);
+    }
+};
+
+/*****************************************************************************************/
+struct max_iter_termination_policy
+{
+    size_t max_iter_;
+    max_iter_termination_policy(size_t max_iter)
+        : max_iter_(max_iter){};
+    bool operator()(size_t iter)
+    {
+        if (iter < max_iter_) {
+            return false;
+        }
+        return true;
+    }
+};
+
+struct wallclock_termination_policy
+{
+    std::chrono::steady_clock::time_point start_;
+    std::chrono::milliseconds             max_time_;
+
+    explicit wallclock_termination_policy(std::chrono::milliseconds max_time)
+        : start_(std::chrono::steady_clock::now())
+        , max_time_(max_time)
+    {}
+
+    bool operator()(size_t /*iter*/) const
+    {
+        return std::chrono::steady_clock::now() - start_ > max_time_;
+    }
+};
+
+/*****************************************************************************************/
+template<typename ArgumentContainer>
+struct unconstrained_policy
+{
+    void operator()(ArgumentContainer&) {}
+};
+
+template<typename ArgumentContainer, typename RealType>
+struct box_constraints
+{
+    RealType min_, max_;
+    box_constraints(RealType min, RealType max)
+        : min_(min)
+        , max_(max){};
+    void     operator()(ArgumentContainer& x)
+    {
+        for (auto& xi : x) {
+            xi = (xi < min_) ? min_ : (max_ < xi) ? max_ : xi;
+        }
+    }
+};
+
+template<typename ArgumentContainer, typename RealType>
+struct nonnegativity_constraint
+{
+    void operator()(ArgumentContainer& x) const
+    {
+        for (auto& xi : x) {
+            if (xi < RealType{0})
+                xi = RealType{0};
+        }
+    }
+};
+template<typename ArgumentContainer, typename RealType>
+struct l2_ball_constraint
+{
+    RealType radius_;
+
+    explicit l2_ball_constraint(RealType radius)
+        : radius_(radius)
+    {}
+
+    void operator()(ArgumentContainer& x) const
+    {
+        RealType norm2 = RealType{0};
+        for (auto& xi : x)
+            norm2 += xi * xi;
+        RealType norm = std::sqrt(norm2);
+        if (norm > radius_) {
+            RealType scale = radius_ / norm;
+            for (auto& xi : x)
+                xi *= scale;
+        }
+    }
+};
+
+template<typename ArgumentContainer, typename RealType>
+struct l1_ball_constraint
+{
+    RealType radius_;
+
+    explicit l1_ball_constraint(RealType radius)
+        : radius_(radius)
+    {}
+
+    void operator()(ArgumentContainer& x) const
+    {
+        RealType norm1 = RealType{0};
+        for (auto& xi : x)
+            norm1 += std::abs(xi);
+
+        if (norm1 > radius_) {
+            RealType scale = radius_ / norm1;
+            for (auto& xi : x)
+                xi *= scale;
+        }
+    }
+};
+template<typename ArgumentContainer, typename RealType>
+struct simplex_constraint
+{
+    void operator()(ArgumentContainer& x) const
+    {
+        RealType sum = RealType{0};
+        for (auto& xi : x) {
+            if (xi < RealType{0})
+                xi = RealType{0}; // clip negatives
+            sum += xi;
+        }
+        if (sum > RealType{0}) {
+            for (auto& xi : x)
+                xi /= sum;
+        }
+    }
+};
+
+template<typename ArgumentContainer>
+struct function_constraint
+{
+    using func_t = void (*)(ArgumentContainer&);
+
+    func_t f_;
+
+    explicit function_constraint(func_t f)
+        : f_(f)
+    {}
+
+    void operator()(ArgumentContainer& x) const { f_(x); }
+};
+template<typename ArgumentContainer, typename RealType>
+struct unit_sphere_constraint
+{
+    void operator()(ArgumentContainer& x) const
+    {
+        RealType norm2 = RealType{0};
+        for (auto& xi : x)
+            norm2 += xi * xi;
+        RealType norm = std::sqrt(norm2);
+        if (norm > RealType{0}) {
+            for (auto& xi : x)
+                xi /= norm;
+        }
+    }
+};
+/*****************************************************************************************/
+
+template<class Optimizer, class ConstraintPolicy, class ConvergencePolicy, class TerminationPolicy>
+auto minimize_impl(Optimizer&        opt,
+                   ConstraintPolicy  project,
+                   ConvergencePolicy converged,
+                   TerminationPolicy terminate,
+                   bool              history)
+{
+    optimization_result<typename Optimizer::real_type_t> result;
+    size_t                                               iter = 0;
+    do {
+        opt.step();
+        project(opt.arguments());
+        ++iter;
+        if (history) {
+            result.objective_history.push_back(opt.objective_value());
+        }
+
+    } while (!converged(opt.gradients(), opt.objective_value()) && !terminate(iter));
+    result.num_iter        = iter;
+    result.objective_value = opt.objective_value();
+    result.converged       = converged(opt.gradients(), opt.objective_value());
+    return result;
+}
+template<class Optimizer,
+         class ConstraintPolicy = unconstrained_policy<typename Optimizer::argument_container_t>,
+         class ConvergencePolicy = gradient_norm_convergence_policy<typename Optimizer::real_type_t>,
+         class TerminationPolicy = max_iter_termination_policy>
+auto minimize(Optimizer&        opt,
+              ConstraintPolicy  project = ConstraintPolicy{},
+              ConvergencePolicy converged
+              = ConvergencePolicy{static_cast<typename Optimizer::real_type_t>(1e-5)},
+              TerminationPolicy terminate = TerminationPolicy(10000),
+              bool              history   = false)
+{
+    return minimize_impl(opt, project, converged, terminate, history);
+}
+} // namespace optimization
+} // namespace math
+} // namespace boost
+#endif
diff --git a/include/boost/math/optimization/nesterov.hpp b/include/boost/math/optimization/nesterov.hpp
new file mode 100644
index 000000000..e34e146a6
--- /dev/null
+++ b/include/boost/math/optimization/nesterov.hpp
@@ -0,0 +1,15 @@
+#ifndef NESTEROV_HPP
+#define NESTEROV_HPP
+#include <boost/math/optimization/detail/differentiable_opt_utilties.hpp>
+#include <boost/math/optimization/detail/gradient_opt_base.hpp>
+#include <functional>
+#include <vector>
+
+namespace boost {
+namespace math {
+namespace optimization {
+
+} // namespace optimization
+} // namespace math
+} // namespace boost
+#endif // NESTEROV_HPP
diff --git a/test/test_functions_for_optimization.hpp b/test/test_functions_for_optimization.hpp
index b091984f8..88a9a77d0 100644
--- a/test/test_functions_for_optimization.hpp
+++ b/test/test_functions_for_optimization.hpp
@@ -12,6 +12,16 @@
 #include <array>
 #include <vector>
 
+/* simple n-d quadratic function */
+template<typename RealType>
+RealType quadratic(std::vector<RealType> &x)
+{
+    RealType res{0.0};
+    for (auto &item : x) {
+        res += item * item;
+    }
+    return res;
+}
 // Taken from: https://en.wikipedia.org/wiki/Test_functions_for_optimization
 template <typename Real> Real ackley(std::array<Real, 2> const &v) {
   using std::sqrt;
diff --git a/test/test_gradient_descent_optimizer.cpp b/test/test_gradient_descent_optimizer.cpp
new file mode 100644
index 000000000..86d7b38eb
--- /dev/null
+++ b/test/test_gradient_descent_optimizer.cpp
@@ -0,0 +1,328 @@
+#include "test_autodiff_reverse.hpp" // reuse for some basic options
+#include "test_functions_for_optimization.hpp"
+#include <boost/math/differentiation/autodiff_reverse.hpp>
+#include <boost/math/optimization/gradient_descent.hpp>
+#include <boost/math/optimization/minimizer.hpp>
+namespace rdiff = boost::math::differentiation::reverse_mode;
+namespace bopt  = boost::math::optimization;
+BOOST_AUTO_TEST_SUITE(basic_gradient_descent)
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(default_gd_test, T, all_float_types)
+{
+    size_t                         NITER = 2000;
+    size_t                         N     = 15;
+    T                              lr    = T{1e-2};
+    RandomSample<T>                rng{T(-100), (100)};
+    std::vector<rdiff::rvar<T, 1>> x_ad;
+    T                              eps = T{1e-3};
+    for (size_t i = 0; i < N; ++i) {
+        x_ad.push_back(rng.next());
+    }
+    auto gdopt = bopt::make_gradient_descent(&quadratic<rdiff::rvar<T, 1>>, x_ad, lr);
+    for (size_t i = 0; i < NITER; ++i) {
+        gdopt.step();
+    }
+    for (auto& x : x_ad) {
+        BOOST_REQUIRE_SMALL(x.item(), eps);
+    }
+}
+BOOST_AUTO_TEST_CASE_TEMPLATE(test_minimize, T, all_float_types)
+{
+    size_t                         NITER = 2000;
+    size_t                         N     = 15;
+    T                              lr    = T{1e-2};
+    RandomSample<T>                rng{T(-100), (100)};
+    std::vector<rdiff::rvar<T, 1>> x_ad;
+    T                              eps = T{1e-3};
+    for (size_t i = 0; i < N; ++i) {
+        x_ad.push_back(rng.next());
+    }
+    auto gdopt = bopt::make_gradient_descent(&quadratic<rdiff::rvar<T, 1>>, x_ad, lr);
+    auto z     = minimize(gdopt);
+    for (auto& x : x_ad) {
+        BOOST_REQUIRE_SMALL(x.item(), eps);
+    }
+}
+BOOST_AUTO_TEST_CASE_TEMPLATE(random_initializer_test, T, all_float_types)
+{
+    size_t                         N  = 10;
+    T                              lr = T{1e-2};
+    std::vector<rdiff::rvar<T, 1>> x(N);
+
+    auto gdopt = bopt::make_gradient_descent(
+        &quadratic<rdiff::rvar<T, 1>>,
+        x,
+        lr,
+        bopt::random_uniform_initializer_rvar<T>(-2.0, 2.0, 1234)); // all initialized to 5
+    for (auto& xi : x) {
+        T v = xi.item();
+        BOOST_TEST(v >= -2);
+        BOOST_TEST(v <= 2);
+    }
+    gdopt.step();
+}
+BOOST_AUTO_TEST_CASE_TEMPLATE(const_initializer_test, T, all_float_types)
+{
+    size_t                         N  = 10;
+    T                              lr = T{1e-2};
+    std::vector<rdiff::rvar<T, 1>> x(N);
+
+    auto gdopt = bopt::make_gradient_descent(&quadratic<rdiff::rvar<T, 1>>,
+                                             x,
+                                             lr,
+                                             bopt::costant_initializer_rvar<T>(
+                                                 T{5.0})); // all initialized to 5
+
+    for (auto& xi : x) {
+        T v = xi.item();
+        BOOST_REQUIRE_CLOSE(v, T{5.0}, T{1e-3});
+    }
+    gdopt.step();
+}
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(box_constraint_test, T, all_float_types)
+{
+    size_t                         N  = 5;
+    T                              lr = T{1e-2};
+    std::vector<rdiff::rvar<T, 1>> x(N, T{10});
+
+    auto gdopt = bopt::make_gradient_descent(&quadratic<rdiff::rvar<T, 1>>, x, lr);
+
+    auto res = bopt::minimize(gdopt,
+                              bopt::box_constraints<std::vector<rdiff::rvar<T, 1>>, T>(-1.0, 1.0));
+
+    for (auto& xi : x) {
+        BOOST_TEST(xi.item() >= -1.0);
+        BOOST_TEST(xi.item() <= 1.0);
+    }
+}
+BOOST_AUTO_TEST_CASE_TEMPLATE(max_iter_test, T, all_float_types)
+{
+    size_t                         N  = 2;
+    T                              lr = T{1e-6}; // very slow learning
+    std::vector<rdiff::rvar<T, 1>> x  = {T{5}, T{5}};
+
+    auto gdopt = bopt::make_gradient_descent(&quadratic<rdiff::rvar<T, 1>>, x, lr);
+
+    size_t max_iter = 50;
+    auto   res      = bopt::minimize(gdopt,
+                              bopt::unconstrained_policy<std::vector<rdiff::rvar<T, 1>>>{},
+                              bopt::gradient_norm_convergence_policy<T>(T{1e-20}),
+                              bopt::max_iter_termination_policy(max_iter));
+
+    BOOST_TEST(!res.converged); // should not converge with tiny lr
+    BOOST_REQUIRE_EQUAL(res.num_iter, max_iter);
+}
+BOOST_AUTO_TEST_CASE_TEMPLATE(history_tracking_test, T, all_float_types)
+{
+    size_t                         N  = 3;
+    T                              lr = T{1e-2};
+    std::vector<rdiff::rvar<T, 1>> x  = {T{3}, T{-4}, T{5}};
+
+    auto gdopt = bopt::make_gradient_descent(&quadratic<rdiff::rvar<T, 1>>, x, lr);
+
+    auto res = bopt::minimize(gdopt,
+                              bopt::unconstrained_policy<std::vector<rdiff::rvar<T, 1>>>{},
+                              bopt::gradient_norm_convergence_policy<T>(T{1e-6}),
+                              bopt::max_iter_termination_policy(1000),
+                              true); // enable history
+
+    BOOST_TEST(!res.objective_history.empty());
+    BOOST_TEST(res.objective_history.front() > res.objective_history.back());
+}
+BOOST_AUTO_TEST_CASE_TEMPLATE(rosenbrock_test, T, all_float_types)
+{
+    std::array<rdiff::rvar<T, 1>, 2> x  = {T{-1.2}, T{1.0}}; // bad start
+    T                                lr = T{1e-3};
+
+    auto gdopt = bopt::make_gradient_descent(&rosenbrock_saddle<rdiff::rvar<T, 1>>, x, lr);
+
+    auto res = bopt::minimize(gdopt,
+                              bopt::unconstrained_policy<std::array<rdiff::rvar<T, 1>, 2>>{},
+                              bopt::gradient_norm_convergence_policy<T>(T{1e-4}),
+                              bopt::max_iter_termination_policy(50000));
+    BOOST_TEST(res.converged);
+    BOOST_REQUIRE_CLOSE(x[0].item(), T{1.0}, T{1e-1});
+    BOOST_REQUIRE_CLOSE(x[1].item(), T{1.0}, T{1e-1});
+}
+BOOST_AUTO_TEST_CASE(objective_tol_convergence_test)
+{
+    using policy_t = bopt::objective_tol_convergence_policy<double>;
+    policy_t            pol(1e-3);
+    std::vector<double> dummy_grad;
+
+    BOOST_TEST(!pol(dummy_grad, 100.0));
+    BOOST_TEST(!pol(dummy_grad, 99.0));
+    BOOST_TEST(pol(dummy_grad, 99.0005));
+}
+
+BOOST_AUTO_TEST_CASE(relative_objective_tol_test)
+{
+    using policy_t = bopt::relative_objective_tol_policy<double>;
+    policy_t pol(1e-3);
+
+    std::vector<double> dummy_grad;
+    BOOST_TEST(!pol(dummy_grad, 1000.0));
+    BOOST_TEST(!pol(dummy_grad, 1010.0));
+    BOOST_TEST(pol(dummy_grad, 1010.5));
+}
+
+BOOST_AUTO_TEST_CASE(combined_policy_test)
+{
+    using pol_abs  = bopt::objective_tol_convergence_policy<double>;
+    using pol_rel  = bopt::relative_objective_tol_policy<double>;
+    using pol_comb = bopt::combined_convergence_policy<pol_abs, pol_rel>;
+
+    pol_abs  abs_pol(1e-6);
+    pol_rel  rel_pol(1e-3);
+    pol_comb comb(abs_pol, rel_pol);
+
+    std::vector<double> dummy_grad;
+
+    BOOST_TEST(!comb(dummy_grad, 100.0));
+    BOOST_TEST(!comb(dummy_grad, 110.0));
+    BOOST_TEST(comb(dummy_grad, 110.1));
+    BOOST_TEST(comb(dummy_grad, 110.1000001));
+}
+BOOST_AUTO_TEST_CASE(nonnegativity_constraint_test)
+{
+    std::vector<double>                                         x = {1.0, -2.0, 3.0, -4.0};
+    bopt::nonnegativity_constraint<std::vector<double>, double> proj;
+    proj(x);
+
+    for (auto& xi : x)
+        BOOST_TEST(xi >= 0.0);
+    BOOST_TEST(x == std::vector<double>({1.0, 0.0, 3.0, 0.0}));
+}
+
+BOOST_AUTO_TEST_CASE(l2_ball_constraint_test)
+{
+    std::vector<double>                                   x = {3.0, 4.0}; // norm = 5
+    bopt::l2_ball_constraint<std::vector<double>, double> proj(1.0);
+    proj(x);
+
+    double norm = std::sqrt(x[0] * x[0] + x[1] * x[1]);
+    BOOST_TEST(std::abs(norm - 1.0) < 1e-12); // projected to unit circle
+}
+
+BOOST_AUTO_TEST_CASE(l1_ball_constraint_test)
+{
+    std::vector<double>                                   x = {3.0, 4.0}; // L1 norm = 7
+    bopt::l1_ball_constraint<std::vector<double>, double> proj(2.0);
+    proj(x);
+
+    double norm1 = std::abs(x[0]) + std::abs(x[1]);
+    BOOST_TEST(std::abs(norm1 - 2.0) < 1e-12);
+}
+
+BOOST_AUTO_TEST_CASE(simplex_constraint_test)
+{
+    std::vector<double> x = {-1.0, 2.0, 3.0}; // has negative and sum != 1
+    bopt::simplex_constraint<std::vector<double>, double> proj;
+    proj(x);
+
+    double sum = 0.0;
+    for (auto& xi : x) {
+        BOOST_TEST(xi >= 0.0); // all nonnegative
+        sum += xi;
+    }
+    BOOST_TEST(std::abs(sum - 1.0) < 1e-12); // normalized to sum=1
+}
+
+BOOST_AUTO_TEST_CASE(unit_sphere_constraint_test)
+{
+    std::vector<double>                                       x = {0.3, 0.4}; // norm = 0.5
+    bopt::unit_sphere_constraint<std::vector<double>, double> proj;
+    proj(x);
+
+    double norm = std::sqrt(x[0] * x[0] + x[1] * x[1]);
+    BOOST_TEST(std::abs(norm - 1.0) < 1e-12); // always projected to sphere
+}
+
+BOOST_AUTO_TEST_CASE(function_constraint_test)
+{
+    auto clip_to_half = [](std::vector<double>& v) {
+        for (auto& xi : v)
+            if (xi > 0.5)
+                xi = 0.5;
+    };
+
+    bopt::function_constraint<std::vector<double>> proj(clip_to_half);
+    std::vector<double>                            x = {0.2, 0.7, 1.5};
+    proj(x);
+
+    BOOST_TEST(x == std::vector<double>({0.2, 0.5, 0.5}));
+}
+
+template<typename RealType>
+struct no_init_policy
+{
+    void operator()(std::vector<RealType>& x) const noexcept {}
+};
+
+template<typename RealType>
+struct analytic_objective_eval_pol
+{
+    template<typename Objective, typename ArgumentContainer>
+    RealType operator()(Objective&& objective, ArgumentContainer& x)
+    {
+        return objective(x);
+    }
+};
+
+template<typename RealType>
+struct analytic_gradient_eval_pol
+{
+    std::vector<RealType> grad_container;
+    template<typename ArgumentContainer>
+    void bind(ArgumentContainer& x, std::vector<std::reference_wrapper<RealType>>& g)
+    {
+        grad_container.resize(x.size(), RealType{0.0});
+        g.clear();
+        g.reserve(x.size());
+        for (size_t i = 0; i < x.size(); ++i) {
+            g.push_back(std::ref(grad_container[i]));
+        }
+    }
+    template<class Objective, class ArgumentContainer, class FunctionEvaluationPolicy>
+    void operator()(Objective&&                obj_f,
+                    ArgumentContainer&         x,
+                    FunctionEvaluationPolicy&& f_eval_pol,
+                    RealType&                  obj_v)
+    {
+        // compute objective via eval policy that takes care of tape
+        RealType v = f_eval_pol(obj_f, x);
+        obj_v      = v;
+        for (size_t i = 0; i < x.size(); ++i) {
+            grad_container[i] = 2 * x[i];
+        }
+    }
+};
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(analytic_derivative_policies, T, all_float_types)
+{
+    std::vector<T>  x;
+    size_t          NITER = 2000;
+    size_t          N     = 15;
+    T               lr    = T{1e-2};
+    RandomSample<T> rng{T(-100), (100)};
+    T               eps = T{1e-3};
+    for (size_t i = 0; i < N; ++i) {
+        x.push_back(rng.next());
+    }
+
+    auto gdopt = bopt::make_gradient_descent(&quadratic<T>,
+                                             x,
+                                             lr,
+                                             no_init_policy<T>{},
+                                             analytic_objective_eval_pol<T>{},
+                                             analytic_gradient_eval_pol<T>{});
+
+    for (size_t i = 0; i < NITER; ++i) {
+        gdopt.step();
+    }
+    for (auto& xi : x) {
+        BOOST_REQUIRE_SMALL(xi, eps);
+    }
+}
+BOOST_AUTO_TEST_SUITE_END()