2
0
mirror of https://github.com/boostorg/math.git synced 2026-01-19 04:22:09 +00:00

CUDA: Get everything working with lambda expressions, and tidy up example and docs.

This commit is contained in:
jzmaddock
2018-07-09 18:36:37 +01:00
parent f8f18c1b97
commit c6c13faf44
3 changed files with 59 additions and 25 deletions

View File

@@ -56,7 +56,7 @@ usable on the CUDA device. These per-thread generators are initialized via rand
of type `MasterGen`: this defaults to `std::random_device` but could equally be a psuedo-random number generator.
It is important though to ensure that the per-thread random generators do not become correlated, or generate
overlapping sequences. For this reason type `ThreadGen` should have a long repeat period, and at the very least
be a differnet type from `MasterGen`.
be a different type from `MasterGen`.
A call to member function `integrate` performs the actual integration, and also controls how the CUDA device is used:
@@ -74,32 +74,46 @@ this first pass is used to get an estimate of the variance and calculate how man
be required by the second pass to reach the desired error bound.
* `max_calls_per_thread`: the maximum number of calls of the integrand by each thread in any single CUDA
invocation. This parameter is used to prevent operating system timeouts from terminating the application.
For example on Windows if the CUDA accellerated display driver is unresponsive for more than 2 seconds, then
For example on Windows if the CUDA accelerated display driver is unresponsive for more than 2 seconds, then
the application using it will simply be terminated.
* `is_compensated`: when `true` each thread will use Kahan-style compensated addition to ensure accuracy. When
`false` then regular addition will be used for improved performance.
For example:
// Define a function to integrate:
auto g = [](std::vector<double> const & x)
{
constexpr const double A = 1.0 / (M_PI * M_PI * M_PI);
return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2]));
};
std::vector<std::pair<double, double>> bounds{{0, M_PI}, {0, M_PI}, {0, M_PI}};
double error_goal = 0.001
naive_monte_carlo<double, decltype(g)> mc(g, bounds);
// Define a function to integrate:
auto g = [] __device__ (const double* x)
{
constexpr const double pi = boost::math::constants::pi<double>();
constexpr const double A = 1.0 / (pi * pi * pi);
return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2]));
};
std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } };
double error_goal = 0.001;
cuda_naive_monte_carlo<double, decltype(g)> mc(g, bounds);
double result = mc.integrate(error_goal);
double result = mc.integrate(error_goal);
std::cout << "Integration result is: " << result << std::endl;
First off, we define the function we wish to integrate.
This function must accept a `std::vector<Real> const &`, and return a `Real`.
Next, we define the domain of integration.
This function must accept a `const Real*`, and return a `Real`.
In comparison to the regular non-CUDA code we loose a good deal of type safety
as CUDA deals in raw pointers, not vectors - so it's up to the client to
ensure that the number of elements accessed matches the number of dimensions passed
to the integrators constructor.
Also note that we've used a lambda expression for the integrand: this currently requires
compilation with `-expt-extended-lambda`.
Next, we define the domain of integration as a vector of pairs - in this case
the domain is [0, PI] over 3 dimensions.
std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } };
The call
naive_monte_carlo<double, decltype(g)> mc(g, bounds);
creates an instance of the monte carlo integrator, and the call to `integrate` then returns the result.
creates an instance of the Monte-Carlo integrator, and the call to `integrate` then returns the result.
[endsect]

View File

@@ -30,7 +30,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
};
template <class F, class Gen, class Real>
void __global__ cuda_naive_monte_carlo_device_proc(const F* f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
void __global__ cuda_naive_monte_carlo_device_proc(F f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -44,7 +44,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
{
for (unsigned j = 0; j < n_dimentions; ++j)
storage_base[j] = p_start_locations[j] + p_scales[j] * dist(gen);
Real fv = (*f)(storage_base);
Real fv = (f)(storage_base);
Real y = fv - c;
Real t = sum + y;
c = (t - sum) - y;
@@ -62,7 +62,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
}
template <class F, class Gen, class Real>
void __global__ cuda_naive_monte_carlo_fast_device_proc(const F* f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
void __global__ cuda_naive_monte_carlo_fast_device_proc(F f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -75,7 +75,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
{
for (unsigned j = 0; j < n_dimentions; ++j)
storage_base[j] = p_start_locations[j] + p_scales[j] * dist(gen);
Real fv = (*f)(storage_base);
Real fv = (f)(storage_base);
sum += fv;
sigma_squared += fv * fv;
}
@@ -96,7 +96,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
{
cuda_naive_monte_carlo(const F& integrand,
std::vector<std::pair<Real, Real>> const & bounds,
const MasterGen& seed) : m_gen(seed), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
const MasterGen& seed) : m_f(integrand), m_gen(seed), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
{
auto it = bounds.begin();
while (it != bounds.end())
@@ -109,7 +109,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
}
cuda_naive_monte_carlo(const F& integrand,
std::vector<std::pair<Real, Real>> const & bounds)
: m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
: m_f(integrand), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
{
auto it = bounds.begin();
while (it != bounds.end())
@@ -145,8 +145,6 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
for (unsigned i = 0; i < host_seeds.size(); ++i)
host_seeds[i].seed(ui_dist(m_gen));
seeds = host_seeds;
thrust::device_vector<F> proc(1);
proc[0] = m_f;
bool first_call = true;
bool have_variance = false;
Real sample_variance = 0;
@@ -177,9 +175,9 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
calls_per_thread = max_calls_per_thread;
// std::cout << "Executing with calls_per_thread = " << calls_per_thread << std::endl;
if(is_compensated)
detail::cuda_naive_monte_carlo_device_proc << <threads / 256, 256 >> > (to_pointer(proc.data()), to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
detail::cuda_naive_monte_carlo_device_proc << <threads / 256, 256 >> > (m_f, to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
else
detail::cuda_naive_monte_carlo_fast_device_proc << <threads / 256, 256 >> > (to_pointer(proc.data()), to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
detail::cuda_naive_monte_carlo_fast_device_proc << <threads / 256, 256 >> > (m_f, to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
first_call = false;
m_calls += threads * calls_per_thread;
// If we haven't been called before then get an estimate of the sample

View File

@@ -171,6 +171,28 @@ int main(void)
test_host_hypersphere_10();
/* Example code from docs */
{
// Define a function to integrate:
auto g = [] __device__ (const double* x)
{
constexpr const double pi = boost::math::constants::pi<double>();
constexpr const double A = 1.0 / (pi * pi * pi);
return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2]));
};
std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } };
double error_goal = 0.001;
cuda_naive_monte_carlo<double, decltype(g)> mc(g, bounds);
double result = mc.integrate(error_goal);
std::cout << "Integration result is: " << result << std::endl;
}
return 0;
}