From 814cb6e49f909cae34c666b605e79a15e584764f Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Fri, 30 Aug 2024 15:46:17 -0400
Subject: [PATCH] Add overview

---
 doc/math.qbk         |  1 +
 doc/overview/gpu.qbk | 59 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 doc/overview/gpu.qbk
diff --git a/doc/math.qbk b/doc/math.qbk
index d6b90efb0..385c93a5e 100644
--- a/doc/math.qbk
+++ b/doc/math.qbk
@@ -557,6 +557,7 @@ and as a CD ISBN 0-9504833-2-X  978-0-9504833-2-0, Classification 519.2-dc22.
 [include overview/standalone.qbk]
 [include overview/result_type_calc.qbk]
 [include overview/error_handling.qbk]
+[include overview/gpu.qbk]
 
 [section:compilers_overview Compilers]
 [compilers_overview]
diff --git a/doc/overview/gpu.qbk b/doc/overview/gpu.qbk
new file mode 100644
index 000000000..70f0164e0
--- /dev/null
+++ b/doc/overview/gpu.qbk
@@ -0,0 +1,59 @@
+[section:gpu Support for GPU programming in Boost.Math]
+
+[h4 GPU Support]
+
+Selected functions, distributions, tools, etc. support running on both the host and the device.
+These functions will have the annotation `BOOST_MATH_GPU_ENABLED` next to their individual documentation.
+We test using CUDA (both NVCC and NVRTC) as well as SYCL to provide a wide range of support.
+
+[h4 How to build with device support]
+
+When compiling with CUDA or SYCL, you must ensure that your code runs inside a kernel function.
+Simply compiling existing code with NVCC is not enough to make it run on the device.
+A simple CUDA kernel to run the Beta Distribution CDF on NVCC would be:
+
+    __global__ void cuda_beta_dist(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+        if (i < num_elements)
+        {
+            out[i] = cdf(boost::math::beta_distribution<double>(), in[i]);
+        }
+    }
+
+And with CUDA using NVRTC:
+
+    const char* cuda_kernel = R"(
+    #include <boost/math/distributions/beta.hpp>
+    extern "C" __global__ 
+    void test_beta_dist_kernel(const double* in, double* out, int num_elements)
+    {
+        const int i = blockDim.x * blockIdx.x + threadIdx.x;
+        if (i < num_elements)
+        {
+            out[i] = boost::math::cdf(boost::math::beta_distribution<double>(), in[i]);
+        }
+    }
+    )";
+
+And lastly on SYCL:
+
+    void sycl_beta_dist(const double* in, double* out, int num_elements, sycl::queue& q)
+    {
+        q.submit([&](sycl::handler& h) {
+            h.parallel_for(sycl::range<1>(num_elements), [=](sycl::id<1> i) {
+                out[i] = boost::math::cdf(boost::math::beta_distribution<double>(), in[i]);
+            });
+        });
+    }
+
+Once your kernel function has been written, use your framework's mechanism for launching the kernel.
+
+[/ 
+  Copyright 2024. Matt Borland
+  Distributed under the Boost Software License, Version 1.0.
+  (See accompanying file LICENSE_1_0.txt or copy at
+  http://www.boost.org/LICENSE_1_0.txt).
+]
+