From 3f7aea4738a8b5032e8869e7b0590208c203c0ea Mon Sep 17 00:00:00 2001
From: Peter Turcan
Date: Tue, 14 Oct 2025 11:31:57 -0700
Subject: [PATCH] Minor code updates to Machine Learning scenario (#516)

---
 .../ROOT/pages/task-machine-learning.adoc | 96 ++++++++++++-------
 1 file changed, 64 insertions(+), 32 deletions(-)

diff --git a/user-guide/modules/ROOT/pages/task-machine-learning.adoc b/user-guide/modules/ROOT/pages/task-machine-learning.adoc
index c1223cc..8fea76d 100644
--- a/user-guide/modules/ROOT/pages/task-machine-learning.adoc
+++ b/user-guide/modules/ROOT/pages/task-machine-learning.adoc
@@ -117,9 +117,9 @@ int main() {
     C = prod(A, B);
 
     // Print results
-    std::cout << "Matrix A (random values):\n" << A << "\n\n";
-    std::cout << "Matrix B (random values):\n" << B << "\n\n";
-    std::cout << "Result of A * B:\n" << C << "\n";
+    std::cout << "Matrix A (random values):\n" << std::setprecision(51) << A << "\n\n";
+    std::cout << "Matrix B (random values):\n" << std::setprecision(51) << B << "\n\n";
+    std::cout << "Result of A * B:\n" << std::setprecision(51) << C << "\n";
 
     return 0;
 }
@@ -131,13 +131,13 @@ Running the code should give you output similar to the following:
 
 [source,text]
 ----
 Matrix A (random values):
-[3,3]((0.6344,0.797229,0.149486),(0.205832,0.854583,0.444135),(0.175206,0.261295,0.244765))
+[3,3]((0.070812058635056018829345703125,0.80709342076443135738372802734375,0.6618001046590507030487060546875),(0.849498252384364604949951171875,0.95166688528843224048614501953125,0.8414736413396894931793212890625),(0.732556092552840709686279296875,0.607468723319470882415771484375,0.10045330529101192951202392578125))
 
 Matrix B (random values):
-[3,3]((0.622424,0.111231,0.326372),(0.148841,0.4861,0.0497033),(0.876468,0.0207629,0.314664))
+[3,3]((0.8722223858349025249481201171875,0.7344769672490656375885009765625,0.66293510119430720806121826171875),(0.36406232439912855625152587890625,0.86651482223533093929290771484375,0.35279963747598230838775634765625),(0.75558476778678596019744873046875,0.78821337711997330188751220703125,0.7253504456020891666412353515625))
 
 Result of A * B:
-[3,3]((0.644546,0.461201,0.293713),(0.644582,0.447529,0.249407),(0.362473,0.151586,0.147188))
+[3,3]((0.855642247899371810907712815330583566719724331051111,1.27300793356359748150765862084732304992940044030547,0.811723066325901586521348457514690721836814191192389),(1.72322211665940665047775867679824557399115292355418,2.11183114262540423141214021574008086190588073804975,1.50927322274462834299961835893277850573213072493672),(0.936009285567514438288188455272731403056241106241941,1.14360486900502322343527519810102432984422193840146,0.772815742467169064793160171422670146057498641312122))
 ----
 
@@ -272,21 +272,24 @@ Save the Iris file to your local computer, and update the following code with th
 [source,cpp]
 ----
-#include <iostream>
-#include <boost/numeric/ublas/vector.hpp>
-#include <boost/random.hpp>
-#include <boost/random/random_device.hpp>
+#include <iostream>
+#include <boost/numeric/ublas/vector.hpp>  // Boost linear algebra: vector, matrix, etc.
+#include <boost/random.hpp>                // Boost Random library (random_device, mt19937, distributions)
+#include <boost/random/random_device.hpp>  // Ensures random_device is available
+namespace br = boost::random;              // Alias for convenience
+using namespace boost::numeric::ublas;     // For uBLAS vector<>
+using DataPoint = vector<double>;          // A single data point = 4 numeric features
+using Cluster = std::vector<DataPoint>;    // A cluster = collection of data points
 
-namespace br = boost::random;
-using namespace boost::numeric::ublas;
-using DataPoint = vector<double>;
-using Cluster = std::vector<DataPoint>;
-
-constexpr size_t FEATURES = 4;
-constexpr size_t K = 3;
-constexpr size_t MAX_ITER = 100;
+// Constants for the Iris dataset and K-Means parameters
+constexpr size_t FEATURES = 4;    // Number of features per sample (Iris = 4)
+constexpr size_t K = 3;           // Number of clusters (3 Iris species)
+constexpr size_t MAX_ITER = 100;  // Max number of K-Means iterations
 
+// -----------------------------
+// Load data from CSV file
+// -----------------------------
 std::vector<DataPoint> load_iris_csv(const std::string& filename) {
     std::ifstream file(filename);
     std::vector<DataPoint> data;
@@ -296,20 +299,22 @@ std::vector<DataPoint> load_iris_csv(const std::string& filename) {
         throw std::runtime_error("Could not open file.");
     }
 
-    // Skip the first line of text
+    // Skip header line (contains column names)
     std::getline(file, line);
 
-    //sepal_length, sepal_width, petal_length, petal_width, species
+    // CSV format: sepal_length, sepal_width, petal_length, petal_width, species
     while (std::getline(file, line)) {
         std::stringstream ss(line);
         std::string token;
         DataPoint point(FEATURES);
 
+        // Parse first 4 numeric values
         for (size_t i = 0; i < FEATURES; ++i) {
             if (!std::getline(ss, token, ',')) break;
             point(i) = std::stod(token);
         }
 
+        // Only add valid 4-element vectors
         if (point.size() == FEATURES)
             data.push_back(point);
     }
@@ -317,6 +322,9 @@ std::vector<DataPoint> load_iris_csv(const std::string& filename) {
     return data;
 }
 
+// -----------------------------
+// Euclidean distance function
+// -----------------------------
 double euclidean_distance(const DataPoint& a, const DataPoint& b) {
     double sum = 0.0;
     for (size_t i = 0; i < a.size(); ++i)
@@ -324,6 +332,9 @@ double euclidean_distance(const DataPoint& a, const DataPoint& b) {
     return std::sqrt(sum);
 }
 
+// -----------------------------
+// Find index of nearest centroid
+// -----------------------------
 size_t closest_centroid(const DataPoint& point, const std::vector<DataPoint>& centroids) {
     double min_dist = std::numeric_limits<double>::max();
     size_t index = 0;
@@ -337,51 +348,67 @@ size_t closest_centroid(const DataPoint& point, const std::vector<DataPoint>& ce
     return index;
 }
 
+// -----------------------------
+// Compute centroid (mean of cluster points)
+// -----------------------------
 DataPoint compute_centroid(const Cluster& cluster) {
     DataPoint centroid(FEATURES, 0.0);
     if (cluster.empty()) return centroid;
+
     for (const auto& point : cluster)
-        centroid += point;
-    return centroid / static_cast<double>(cluster.size());
+        centroid += point;  // Sum all data points in cluster
+
+    return centroid / static_cast<double>(cluster.size());  // Average each dimension
 }
 
+// -----------------------------
+// Initialize centroids randomly
+// -----------------------------
 std::vector<DataPoint> init_random_centroids(const std::vector<DataPoint>& data, size_t k) {
-    br::random_device rd;   // Seed from system entropy
-    br::mt19937 gen(rd());  // Mersenne Twister RNG
-    br::uniform_int_distribution<> dist(0, data.size() - 1);
+    br::random_device rd;   // Hardware entropy source
+    br::mt19937 gen(rd());  // Mersenne Twister engine
+    br::uniform_int_distribution<> dist(0, data.size() - 1);  // Random index range
 
     std::vector<DataPoint> centroids;
     for (size_t i = 0; i < k; ++i)
-        centroids.push_back(data[dist(gen)]);
+        centroids.push_back(data[dist(gen)]);  // Pick random samples as initial centroids
     return centroids;
 }
 
+// -----------------------------
+// K-Means clustering algorithm
+// -----------------------------
 void kmeans(const std::vector<DataPoint>& data, size_t k) {
     auto centroids = init_random_centroids(data, k);
-    std::vector<size_t> assignments(data.size(), 0);
+    std::vector<size_t> assignments(data.size(), 0);  // Each data point’s assigned cluster
 
     for (size_t iter = 0; iter < MAX_ITER; ++iter) {
         bool changed = false;
-        std::vector<Cluster> clusters(k);
+        std::vector<Cluster> clusters(k);  // One cluster per centroid
 
+        // Step 1: Assign each point to the closest centroid
         for (size_t i = 0; i < data.size(); ++i) {
             size_t idx = closest_centroid(data[i], centroids);
             if (idx != assignments[i]) {
-                changed = true;
+                changed = true;  // Track if any point changed cluster
                 assignments[i] = idx;
             }
             clusters[idx].push_back(data[i]);
         }
 
+        // Stop early if clusters no longer change
         if (!changed) {
             std::cout << "\nConverged after " << iter << " iterations.\n";
             break;
         }
 
+        // Step 2: Recompute centroids as cluster means
         for (size_t i = 0; i < k; ++i)
             centroids[i] = compute_centroid(clusters[i]);
     }
 
+    // -----------------------------
     // Output cluster summaries
+    // -----------------------------
     for (size_t i = 0; i < k; ++i) {
         std::cout << "Cluster " << i + 1 << ": "
                   << std::count(assignments.begin(), assignments.end(), i)
@@ -389,15 +416,20 @@ void kmeans(const std::vector<DataPoint>& data, size_t k) {
     }
 }
 
+// -----------------------------
+// Main program
+// -----------------------------
 int main() {
     try {
-
-        // Enter the path to your copy of the Iris data
-        auto data = load_iris_csv("/iris.csv");
+        // Load the Iris dataset (CSV path required)
+        auto data = load_iris_csv("\\iris.csv");
         std::cout << "Loaded " << data.size() << " samples.\n";
+
+        // Run K-Means clustering
         kmeans(data, K);
     }
     catch (const std::exception& e) {
+        // Handle file or numeric parsing errors gracefully
         std::cerr << "Error: " << e.what() << "\n";
         return 1;
     }