CUDA: Get everything working with lambda expressions, and tidy up example and docs.

2026-01-19 04:22:09 +00:00 · 2018-07-09 18:36:37 +01:00
parent f8f18c1b97
commit c6c13faf44
3 changed files with 59 additions and 25 deletions
--- a/doc/quadrature/cuda_naive_monte_carlo.qbk
+++ b/doc/quadrature/cuda_naive_monte_carlo.qbk
@@ -56,7 +56,7 @@ usable on the CUDA device.  These per-thread generators are initialized via rand
 of type `MasterGen`: this defaults to `std::random_device` but could equally be a psuedo-random number generator.
 It is important though to ensure that the per-thread random generators do not become correlated, or generate
 overlapping sequences.  For this reason type `ThreadGen` should have a long repeat period, and at the very least 
-be a differnet type from `MasterGen`.
+be a different type from `MasterGen`.

 A call to member function `integrate` performs the actual integration, and also controls how the CUDA device is used:

@@ -74,32 +74,46 @@ this first pass is used to get an estimate of the variance and calculate how man
 be required by the second pass to reach the desired error bound.
 * `max_calls_per_thread`: the maximum number of calls of the integrand by each thread in any single CUDA
 invocation.  This parameter is used to prevent operating system timeouts from terminating the application.
-For example on Windows if the CUDA accellerated display driver is unresponsive for more than 2 seconds, then
+For example on Windows if the CUDA accelerated display driver is unresponsive for more than 2 seconds, then
 the application using it will simply be terminated.
 * `is_compensated`: when `true` each thread will use Kahan-style compensated addition to ensure accuracy.  When
 `false` then regular addition will be used for improved performance.

 For example:

-    // Define a function to integrate:
-    auto g = [](std::vector<double> const & x)
-    {
-      constexpr const double A = 1.0 / (M_PI * M_PI * M_PI);
-      return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2]));
-    };
-    std::vector<std::pair<double, double>> bounds{{0, M_PI}, {0, M_PI}, {0, M_PI}};
-    double error_goal = 0.001
-    naive_monte_carlo<double, decltype(g)> mc(g, bounds);
+      // Define a function to integrate:
+      auto g = [] __device__ (const double* x)
+      {
+         constexpr const double pi = boost::math::constants::pi<double>();
+         constexpr const double A = 1.0 / (pi * pi * pi);
+         return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2]));
+      };
+      std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } };
+      double error_goal = 0.001;
+      cuda_naive_monte_carlo<double, decltype(g)> mc(g, bounds);

-    double result = mc.integrate(error_goal);
+      double result = mc.integrate(error_goal);
+
+      std::cout << "Integration result is: " << result << std::endl;

 First off, we define the function we wish to integrate.
-This function must accept a `std::vector<Real> const &`, and return a `Real`.
-Next, we define the domain of integration.
+This function must accept a `const Real*`, and return a `Real`.
+In comparison to the regular non-CUDA code we loose a good deal of type safety
+as CUDA deals in raw pointers, not vectors - so it's up to the client to
+ensure that the number of elements accessed matches the number of dimensions passed
+to the integrators constructor.
+Also note that we've used a lambda expression for the integrand: this currently requires
+compilation with `-expt-extended-lambda`.
+
+Next, we define the domain of integration as a vector of pairs - in this case
+the domain is [0, PI] over 3 dimensions.
+
+   std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } };
+
 The call

    naive_monte_carlo<double, decltype(g)> mc(g, bounds);

-creates an instance of the monte carlo integrator, and the call to `integrate` then returns the result.
+creates an instance of the Monte-Carlo integrator, and the call to `integrate` then returns the result.

 [endsect]
--- a/include/boost/math/quadrature/cuda_naive_monte_carlo.hpp
+++ b/include/boost/math/quadrature/cuda_naive_monte_carlo.hpp
@@ -30,7 +30,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
   };

   template <class F, class Gen, class Real>
-   void __global__ cuda_naive_monte_carlo_device_proc(const F* f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
+   void __global__ cuda_naive_monte_carlo_device_proc(F f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
   {
      int id = blockIdx.x * blockDim.x + threadIdx.x;

@@ -44,7 +44,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
      {
         for (unsigned j = 0; j < n_dimentions; ++j)
            storage_base[j] = p_start_locations[j] + p_scales[j] * dist(gen);
-         Real fv = (*f)(storage_base);
+         Real fv = (f)(storage_base);
         Real y = fv - c;
         Real t = sum + y;
         c = (t - sum) - y;
@@ -62,7 +62,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
   }

   template <class F, class Gen, class Real>
-   void __global__ cuda_naive_monte_carlo_fast_device_proc(const F* f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
+   void __global__ cuda_naive_monte_carlo_fast_device_proc(F f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
   {
      int id = blockIdx.x * blockDim.x + threadIdx.x;

@@ -75,7 +75,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
      {
         for (unsigned j = 0; j < n_dimentions; ++j)
            storage_base[j] = p_start_locations[j] + p_scales[j] * dist(gen);
-         Real fv = (*f)(storage_base);
+         Real fv = (f)(storage_base);
         sum += fv;
         sigma_squared += fv * fv;
      }
@@ -96,7 +96,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
   {
      cuda_naive_monte_carlo(const F& integrand,
         std::vector<std::pair<Real, Real>> const & bounds,
-         const MasterGen& seed) : m_gen(seed), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
+         const MasterGen& seed) : m_f(integrand), m_gen(seed), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
      {
         auto it = bounds.begin();
         while (it != bounds.end())
@@ -109,7 +109,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
      }
      cuda_naive_monte_carlo(const F& integrand,
         std::vector<std::pair<Real, Real>> const & bounds) 
-         : m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
+         : m_f(integrand), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
      {
         auto it = bounds.begin();
         while (it != bounds.end())
@@ -145,8 +145,6 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
         for (unsigned i = 0; i < host_seeds.size(); ++i)
            host_seeds[i].seed(ui_dist(m_gen));
         seeds = host_seeds;
-         thrust::device_vector<F> proc(1);
-         proc[0] = m_f;
         bool first_call = true;
         bool have_variance = false;
         Real sample_variance = 0;
@@ -177,9 +175,9 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
                  calls_per_thread = max_calls_per_thread;
               // std::cout << "Executing with calls_per_thread = " << calls_per_thread << std::endl;
               if(is_compensated)
-                  detail::cuda_naive_monte_carlo_device_proc << <threads / 256, 256 >> > (to_pointer(proc.data()), to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
+                  detail::cuda_naive_monte_carlo_device_proc << <threads / 256, 256 >> > (m_f, to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
               else
-                  detail::cuda_naive_monte_carlo_fast_device_proc << <threads / 256, 256 >> > (to_pointer(proc.data()), to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
+                  detail::cuda_naive_monte_carlo_fast_device_proc << <threads / 256, 256 >> > (m_f, to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
               first_call = false;
               m_calls += threads * calls_per_thread;
               // If we haven't been called before then get an estimate of the sample 
--- a/test/cuda/misc/test_naive_monte_carlo.cu
+++ b/test/cuda/misc/test_naive_monte_carlo.cu
@@ -171,6 +171,28 @@ int main(void)
   
   test_host_hypersphere_10();

+   /* Example code from docs */
+
+   {
+
+      // Define a function to integrate:
+      auto g = [] __device__ (const double* x)
+      {
+         constexpr const double pi = boost::math::constants::pi<double>();
+         constexpr const double A = 1.0 / (pi * pi * pi);
+         return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2]));
+      };
+      std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } };
+      double error_goal = 0.001;
+      cuda_naive_monte_carlo<double, decltype(g)> mc(g, bounds);
+
+      double result = mc.integrate(error_goal);
+
+      std::cout << "Integration result is: " << result << std::endl;
+
+
+   }
+
   return 0;
 }