From 33e2c249dd3f6bcfb7a002d2005a373628b94fc2 Mon Sep 17 00:00:00 2001
From: mzhelyez
Date: Mon, 25 Aug 2025 12:47:32 +0200
Subject: [PATCH] modified docs

---
 doc/differentiation/autodiff_reverse.qbk | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/doc/differentiation/autodiff_reverse.qbk b/doc/differentiation/autodiff_reverse.qbk
index fd4b79e12..16afaee4f 100644
--- a/doc/differentiation/autodiff_reverse.qbk
+++ b/doc/differentiation/autodiff_reverse.qbk
@@ -15,7 +15,13 @@
 namespace differentiation {
 namespace reverse_mode {
-    /* autodiff variable of type RealType (numeric types), stores derivatives up to DerivativeOrder*/
+    /* autodiff variable of type RealType (numeric types), stores derivatives up to DerivativeOrder.
+     * rvar inherits from a generic expression base class,
+     *     expression<rvar<RealType, DerivativeOrder>>
+     * This is the Curiously Recurring Template Pattern (CRTP).
+     * The purpose is for rvar to act as a terminal node in an expression graph, so that it can be combined
+     * with other expression-based types (sums, products, etc.) to form expression graphs.
+     */
     template <typename RealType, size_t DerivativeOrder>
     class rvar : public expression<rvar<RealType, DerivativeOrder>>
     {
         // inner_t of rvar = var
@@ -60,6 +66,11 @@
     }
 
     // gradient tape holds the computational graph
+    /*
+     * The expression graph is stored on a tape. The tape is closely related to the memory management
+     * system in the library. BOOST_MATH_BUFFER_SIZE is set to 65536 by default. It controls the block
+     * size of the internal memory arena. It is a macro and can be set at compile time.
+     */
     template <typename RealType, size_t DerivativeOrder>
     class gradient_tape
     {
@@ -316,7 +327,17 @@
 The model and loss functions are called. This is just to initialize y_fit and loss_v.
     // 5. Gradient Descent Loop
     double learning_rate = 1e-3;
-The learning rate is a tunable parameter determining the "velocity" with which we descend to a solution.
+The learning rate controls how large a step we take in the direction of the
+negative gradient each iteration. Intuitively, it sets the "velocity" of
+descent toward a minimum.
+
+Too high: the optimization may overshoot minima, oscillate, or even diverge.
+Too low: convergence will be very slow and may stall in shallow regions.
+
+In practice, values in the range [1e-4, 1e-1] are common starting points,
+with 1e-3 being a typical safe default for many problems. The best choice
+depends on the scale of the data, the model, and the curvature of the loss
+landscape.
     while (loss_v > 0.005)
     {
         tape.zero_grad(); // zero out all the adjoints
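
A note on the CRTP comment added in the first hunk: the pattern is easiest to see in a
stripped-down example. The sketch below is illustrative only; the names expression,
terminal and sum_expr are simplified stand-ins, not the library's actual types, and no
derivative bookkeeping is shown, only how CRTP lets terminal and composite nodes combine
into one expression graph.

    #include <iostream>

    // Hypothetical, minimal CRTP base: every node derives from expression<Derived>,
    // so generic code can recover the concrete node type without virtual dispatch.
    template <typename Derived>
    struct expression {
        const Derived& self() const { return static_cast<const Derived&>(*this); }
        double value() const { return self().value_impl(); }
    };

    // Terminal node: plays the role rvar plays in the library (name is illustrative).
    struct terminal : expression<terminal> {
        double v;
        explicit terminal(double x) : v(x) {}
        double value_impl() const { return v; }
    };

    // Composite node: a sum of two sub-expressions, built without evaluating them.
    template <typename L, typename R>
    struct sum_expr : expression<sum_expr<L, R>> {
        L lhs;  // stored by value in this sketch to keep lifetimes simple
        R rhs;
        sum_expr(const L& l, const R& r) : lhs(l), rhs(r) {}
        double value_impl() const { return lhs.value() + rhs.value(); }
    };

    // operator+ combines any two expression nodes into a larger graph.
    template <typename L, typename R>
    sum_expr<L, R> operator+(const expression<L>& l, const expression<R>& r) {
        return sum_expr<L, R>(l.self(), r.self());
    }

    int main() {
        terminal a(2.0), b(3.0);
        auto graph = a + b + a;              // sum_expr<sum_expr<terminal, terminal>, terminal>
        std::cout << graph.value() << "\n";  // prints 7
    }

Storing sub-expressions by value keeps the sketch free of lifetime issues; a real
expression-template library makes more careful choices here.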
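
On the BOOST_MATH_BUFFER_SIZE remark in the second hunk: the macro-with-default pattern it
describes can be shown in isolation. The fallback definition below mimics what a library
header typically does and is not the library's actual code; the real macro is consumed by
the tape's memory arena rather than printed.

    #include <cstddef>
    #include <iostream>

    // A user can override the value with -DBOOST_MATH_BUFFER_SIZE=131072 on the compiler
    // command line, or by defining the macro before the library headers are included;
    // otherwise the default below applies.
    #ifndef BOOST_MATH_BUFFER_SIZE
    #define BOOST_MATH_BUFFER_SIZE 65536  // default block size of the arena
    #endif

    int main() {
        constexpr std::size_t block_size = BOOST_MATH_BUFFER_SIZE;
        std::cout << "arena block size: " << block_size << " bytes\n";
    }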
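
The learning-rate discussion in the last hunk can be made concrete with a tiny,
library-independent experiment. The program below uses plain doubles and a hand-written
derivative of f(x) = (x - 3)^2, with none of the reverse-mode types from the patch, to
show how the step size changes convergence behaviour.

    #include <cmath>
    #include <cstdio>

    // Minimize f(x) = (x - 3)^2 with fixed-step gradient descent and report how many
    // iterations each learning rate needs to reach |f'(x)| < 1e-6.
    int main() {
        const double rates[] = {1e-4, 1e-2, 1e-1, 1.1};  // the last one exceeds the stable range
        for (double lr : rates) {
            double x = 0.0;
            int iters = 0;
            while (std::fabs(2.0 * (x - 3.0)) > 1e-6 && iters < 200000 && std::isfinite(x)) {
                x -= lr * 2.0 * (x - 3.0);  // step against the gradient
                ++iters;
            }
            std::printf("lr = %-6g  iterations = %-7d  final x = %g\n", lr, iters, x);
        }
    }

For this quadratic, fixed-step descent converges only for learning rates below 1.0: the 1.1
entry diverges, the mid-range rates converge in tens to hundreds of steps, and the smallest
rate still converges but needs tens of thousands of iterations.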