Minor code updates to Machine Learning scenario (#516)

This commit is contained in:
Peter Turcan
2025-10-14 11:31:57 -07:00
committed by GitHub
parent 6f393920f0
commit 3f7aea4738

View File

@@ -117,9 +117,9 @@ int main() {
C = prod(A, B);
// Print results
std::cout << "Matrix A (random values):\n" << A << "\n\n";
std::cout << "Matrix B (random values):\n" << B << "\n\n";
std::cout << "Result of A * B:\n" << C << "\n";
std::cout << "Matrix A (random values):\n" << std::setprecision(51) << A << "\n\n";
std::cout << "Matrix B (random values):\n" << std::setprecision(51) << B << "\n\n";
std::cout << "Result of A * B:\n" << std::setprecision(51) << C << "\n";
return 0;
}
@@ -131,13 +131,13 @@ Running the code should give you output similar to the following:
[source,text]
----
Matrix A (random values):
[3,3]((0.6344,0.797229,0.149486),(0.205832,0.854583,0.444135),(0.175206,0.261295,0.244765))
[3,3]((0.070812058635056018829345703125,0.80709342076443135738372802734375,0.6618001046590507030487060546875),(0.849498252384364604949951171875,0.95166688528843224048614501953125,0.8414736413396894931793212890625),(0.732556092552840709686279296875,0.607468723319470882415771484375,0.10045330529101192951202392578125))
Matrix B (random values):
[3,3]((0.622424,0.111231,0.326372),(0.148841,0.4861,0.0497033),(0.876468,0.0207629,0.314664))
[3,3]((0.8722223858349025249481201171875,0.7344769672490656375885009765625,0.66293510119430720806121826171875),(0.36406232439912855625152587890625,0.86651482223533093929290771484375,0.35279963747598230838775634765625),(0.75558476778678596019744873046875,0.78821337711997330188751220703125,0.7253504456020891666412353515625))
Result of A * B:
[3,3]((0.644546,0.461201,0.293713),(0.644582,0.447529,0.249407),(0.362473,0.151586,0.147188))
[3,3]((0.855642247899371810907712815330583566719724331051111,1.27300793356359748150765862084732304992940044030547,0.811723066325901586521348457514690721836814191192389),(1.72322211665940665047775867679824557399115292355418,2.11183114262540423141214021574008086190588073804975,1.50927322274462834299961835893277850573213072493672),(0.936009285567514438288188455272731403056241106241941,1.14360486900502322343527519810102432984422193840146,0.772815742467169064793160171422670146057498641312122))
----
@@ -272,21 +272,24 @@ Save the Iris file to your local computer, and update the following code with th
[source,cpp]
----
#include <fstream>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/random.hpp>
#include <boost/random/random_device.hpp>
#include <fstream>
#include <boost/numeric/ublas/vector.hpp> // Boost linear algebra: vector, matrix, etc.
#include <boost/random.hpp> // Boost Random library (random_device, mt19937, distributions)
#include <boost/random/random_device.hpp> // Ensures random_device is available
namespace br = boost::random; // Alias for convenience
using namespace boost::numeric::ublas; // For uBLAS vector<>
using DataPoint = vector<double>; // A single data point = 4 numeric features
using Cluster = std::vector<DataPoint>; // A cluster = collection of data points
namespace br = boost::random;
using namespace boost::numeric::ublas;
using DataPoint = vector<double>;
using Cluster = std::vector<DataPoint>;
constexpr size_t FEATURES = 4;
constexpr size_t K = 3;
constexpr size_t MAX_ITER = 100;
// Constants for the Iris dataset and K-Means parameters
constexpr size_t FEATURES = 4; // Number of features per sample (Iris = 4)
constexpr size_t K = 3; // Number of clusters (3 Iris species)
constexpr size_t MAX_ITER = 100; // Max number of K-Means iterations
// -----------------------------
// Load data from CSV file
// -----------------------------
std::vector<DataPoint> load_iris_csv(const std::string& filename) {
std::ifstream file(filename);
std::vector<DataPoint> data;
@@ -296,20 +299,22 @@ std::vector<DataPoint> load_iris_csv(const std::string& filename) {
throw std::runtime_error("Could not open file.");
}
// Skip the first line of text
// Skip header line (contains column names)
std::getline(file, line);
//sepal_length, sepal_width, petal_length, petal_width, species
// CSV format: sepal_length, sepal_width, petal_length, petal_width, species
while (std::getline(file, line)) {
std::stringstream ss(line);
std::string token;
DataPoint point(FEATURES);
// Parse first 4 numeric values
for (size_t i = 0; i < FEATURES; ++i) {
if (!std::getline(ss, token, ',')) break;
point(i) = std::stod(token);
}
// Only add valid 4-element vectors
if (point.size() == FEATURES)
data.push_back(point);
}
@@ -317,6 +322,9 @@ std::vector<DataPoint> load_iris_csv(const std::string& filename) {
return data;
}
// -----------------------------
// Euclidean distance function
// -----------------------------
double euclidean_distance(const DataPoint& a, const DataPoint& b) {
double sum = 0.0;
for (size_t i = 0; i < a.size(); ++i)
@@ -324,6 +332,9 @@ double euclidean_distance(const DataPoint& a, const DataPoint& b) {
return std::sqrt(sum);
}
// -----------------------------
// Find index of nearest centroid
// -----------------------------
size_t closest_centroid(const DataPoint& point, const std::vector<DataPoint>& centroids) {
double min_dist = std::numeric_limits<double>::max();
size_t index = 0;
@@ -337,51 +348,67 @@ size_t closest_centroid(const DataPoint& point, const std::vector<DataPoint>& ce
return index;
}
// -----------------------------
// Compute centroid (mean of cluster points)
// -----------------------------
DataPoint compute_centroid(const Cluster& cluster) {
DataPoint centroid(FEATURES, 0.0);
if (cluster.empty()) return centroid;
for (const auto& point : cluster)
centroid += point;
return centroid / static_cast<double>(cluster.size());
centroid += point; // Sum all data points in cluster
return centroid / static_cast<double>(cluster.size()); // Average each dimension
}
// -----------------------------
// Initialize centroids randomly
// -----------------------------
std::vector<DataPoint> init_random_centroids(const std::vector<DataPoint>& data, size_t k) {
br::random_device rd; // Seed from system entropy
br::mt19937 gen(rd()); // Mersenne Twister RNG
br::uniform_int_distribution<> dist(0, data.size() - 1);
br::random_device rd; // Hardware entropy source
br::mt19937 gen(rd()); // Mersenne Twister engine
br::uniform_int_distribution<> dist(0, data.size() - 1); // Random index range
std::vector<DataPoint> centroids;
for (size_t i = 0; i < k; ++i)
centroids.push_back(data[dist(gen)]);
centroids.push_back(data[dist(gen)]); // Pick random samples as initial centroids
return centroids;
}
// -----------------------------
// K-Means clustering algorithm
// -----------------------------
void kmeans(const std::vector<DataPoint>& data, size_t k) {
auto centroids = init_random_centroids(data, k);
std::vector<size_t> assignments(data.size(), 0);
std::vector<size_t> assignments(data.size(), 0); // Each data point's assigned cluster
for (size_t iter = 0; iter < MAX_ITER; ++iter) {
bool changed = false;
std::vector<Cluster> clusters(k);
std::vector<Cluster> clusters(k); // One cluster per centroid
// Step 1: Assign each point to the closest centroid
for (size_t i = 0; i < data.size(); ++i) {
size_t idx = closest_centroid(data[i], centroids);
if (idx != assignments[i]) {
changed = true;
changed = true; // Track if any point changed cluster
assignments[i] = idx;
}
clusters[idx].push_back(data[i]);
}
// Stop early if clusters no longer change
if (!changed) {
std::cout << "\nConverged after " << iter << " iterations.\n";
break;
}
// Step 2: Recompute centroids as cluster means
for (size_t i = 0; i < k; ++i)
centroids[i] = compute_centroid(clusters[i]);
}
// -----------------------------
// Output cluster summaries
// -----------------------------
for (size_t i = 0; i < k; ++i) {
std::cout << "Cluster " << i + 1 << ": "
<< std::count(assignments.begin(), assignments.end(), i)
@@ -389,15 +416,20 @@ void kmeans(const std::vector<DataPoint>& data, size_t k) {
}
}
// -----------------------------
// Main program
// -----------------------------
int main() {
try {
// Enter the path to your copy of the Iris data
auto data = load_iris_csv("<path>/iris.csv");
// Load the Iris dataset (CSV path required)
auto data = load_iris_csv("<path>\\iris.csv");
std::cout << "Loaded " << data.size() << " samples.\n";
// Run K-Means clustering
kmeans(data, K);
}
catch (const std::exception& e) {
// Handle file or numeric parsing errors gracefully
std::cerr << "Error: " << e.what() << "\n";
return 1;
}