From 33e2c249dd3f6bcfb7a002d2005a373628b94fc2 Mon Sep 17 00:00:00 2001
From: mzhelyez
Date: Mon, 25 Aug 2025 12:47:32 +0200
Subject: [PATCH] modified docs

---
 doc/differentiation/autodiff_reverse.qbk | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/doc/differentiation/autodiff_reverse.qbk b/doc/differentiation/autodiff_reverse.qbk
index fd4b79e12..16afaee4f 100644
--- a/doc/differentiation/autodiff_reverse.qbk
+++ b/doc/differentiation/autodiff_reverse.qbk
@@ -15,7 +15,13 @@
 namespace differentiation {
 namespace reverse_mode {
-    /* autodiff variable of type RealType (numeric types), stores derivatives up to DerivativeOrder*/
+    /* autodiff variable of type RealType (numeric types), stores derivatives up to DerivativeOrder.
+     * rvar inherits from a generic expression base class,
+     *     expression<rvar<RealType, DerivativeOrder>>
+     * This is the Curiously Recurring Template Pattern (CRTP).
+     * The purpose is for rvar to act as a terminal node in an expression graph, so that it can be combined
+     * with other expression-based types (sums, products, etc.) to form expression graphs.
+     */
     template <typename RealType, size_t DerivativeOrder>
     class rvar : public expression<rvar<RealType, DerivativeOrder>>
     {
         // inner_t of rvar = var
@@ -60,6 +66,11 @@
     }
 
     // gradient tape holds the computational graph
+    /*
+     * The expression graph is stored on a tape. The tape is closely related to the memory management
+     * system in the library. BOOST_MATH_BUFFER_SIZE is set to 65536 by default. It controls the block
+     * size of the internal memory arena. It is a macro and can be set at compile time.
+     */
     template <typename RealType, size_t DerivativeOrder>
     class gradient_tape
     {
@@ -316,7 +327,17 @@
 The model and loss functions are called. This is just to initialize y_fit and loss_v.
     // 5. Gradient Descent Loop
     double learning_rate = 1e-3;
-The learning rate is a tunable parameter determining the "velocity" with which we descend to a solution.
+The learning rate controls how large a step we take in the direction of the
+negative gradient each iteration. Intuitively, it sets the "velocity" of
+descent toward a minimum.
+
+Too high: the optimization may overshoot minima, oscillate, or even diverge.
+Too low: convergence will be very slow and may stall in shallow regions.
+
+In practice, values in the range [1e-4, 1e-1] are common starting points,
+with 1e-3 being a typical safe default for many problems. The best choice
+depends on the scale of the data, the model, and the curvature of the loss
+landscape.
     while (loss_v > 0.005)
     {
         tape.zero_grad(); // zero out all the adjoints
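
A note on the CRTP comment added in the first hunk: the pattern is easiest to see in a
stripped-down example. The sketch below is illustrative only; the names expression,
terminal and sum_expr are simplified stand-ins, not the library's actual types, and no
derivative bookkeeping is shown, only how CRTP lets terminal and composite nodes combine
into one expression graph.

    #include <iostream>

    // Hypothetical, minimal CRTP base: every node derives from expression<Derived>,
    // so generic code can recover the concrete node type without virtual dispatch.
    template <typename Derived>
    struct expression {
        const Derived& self() const { return static_cast<const Derived&>(*this); }
        double value() const { return self().value_impl(); }
    };

    // Terminal node: plays the role rvar plays in the library (name is illustrative).
    struct terminal : expression<terminal> {
        double v;
        explicit terminal(double x) : v(x) {}
        double value_impl() const { return v; }
    };

    // Composite node: a sum of two sub-expressions, built without evaluating them.
    template <typename L, typename R>
    struct sum_expr : expression<sum_expr<L, R>> {
        L lhs;  // stored by value in this sketch to keep lifetimes simple
        R rhs;
        sum_expr(const L& l, const R& r) : lhs(l), rhs(r) {}
        double value_impl() const { return lhs.value() + rhs.value(); }
    };

    // operator+ combines any two expression nodes into a larger graph.
    template <typename L, typename R>
    sum_expr<L, R> operator+(const expression<L>& l, const expression<R>& r) {
        return sum_expr<L, R>(l.self(), r.self());
    }

    int main() {
        terminal a(2.0), b(3.0);
        auto graph = a + b + a;              // sum_expr<sum_expr<terminal, terminal>, terminal>
        std::cout << graph.value() << "\n";  // prints 7
    }

Storing sub-expressions by value keeps the sketch free of lifetime issues; a real
expression-template library makes more careful choices here.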
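
On the BOOST_MATH_BUFFER_SIZE remark in the second hunk: the macro-with-default pattern it
describes can be shown in isolation. The fallback definition below mimics what a library
header typically does and is not the library's actual code; the real macro is consumed by
the tape's memory arena rather than printed.

    #include <cstddef>
    #include <iostream>

    // A user can override the value with -DBOOST_MATH_BUFFER_SIZE=131072 on the compiler
    // command line, or by defining the macro before the library headers are included;
    // otherwise the default below applies.
    #ifndef BOOST_MATH_BUFFER_SIZE
    #define BOOST_MATH_BUFFER_SIZE 65536  // default block size of the arena
    #endif

    int main() {
        constexpr std::size_t block_size = BOOST_MATH_BUFFER_SIZE;
        std::cout << "arena block size: " << block_size << " bytes\n";
    }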
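
The learning-rate discussion in the last hunk can be made concrete with a tiny,
library-independent experiment. The program below uses plain doubles and a hand-written
derivative of f(x) = (x - 3)^2, with none of the reverse-mode types from the patch, to
show how the step size changes convergence behaviour.

    #include <cmath>
    #include <cstdio>

    // Minimize f(x) = (x - 3)^2 with fixed-step gradient descent and report how many
    // iterations each learning rate needs to reach |f'(x)| < 1e-6.
    int main() {
        const double rates[] = {1e-4, 1e-2, 1e-1, 1.1};  // the last one exceeds the stable range
        for (double lr : rates) {
            double x = 0.0;
            int iters = 0;
            while (std::fabs(2.0 * (x - 3.0)) > 1e-6 && iters < 200000 && std::isfinite(x)) {
                x -= lr * 2.0 * (x - 3.0);  // step against the gradient
                ++iters;
            }
            std::printf("lr = %-6g  iterations = %-7d  final x = %g\n", lr, iters, x);
        }
    }

For this quadratic, fixed-step descent converges only for learning rates below 1.0: the 1.1
entry diverges, the mid-range rates converge in tens to hundreds of steps, and the smallest
rate still converges but needs tens of thousands of iterations.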