CUDA: Get everything working with lambda expressions, and tidy up example and docs.
@@ -56,7 +56,7 @@ usable on the CUDA device. These per-thread generators are initialized via rand
 of type `MasterGen`: this defaults to `std::random_device` but could equally be a pseudo-random number generator.
 It is important though to ensure that the per-thread random generators do not become correlated, or generate
 overlapping sequences. For this reason type `ThreadGen` should have a long repeat period, and at the very least
-be a differnet type from `MasterGen`.
+be a different type from `MasterGen`.

 A call to member function `integrate` performs the actual integration, and also controls how the CUDA device is used:

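For illustration only, the seeding scheme described above can be sketched on the host as follows; `std::mt19937_64` merely stands in for `ThreadGen` here and is an assumption, not necessarily the library's choice:

    #include <random>
    #include <vector>

    // Seed one per-thread generator (ThreadGen) from a single master generator
    // (MasterGen), so that no two threads start from the same seed.
    std::vector<std::mt19937_64> make_thread_generators(std::size_t n_threads)
    {
        std::random_device master;                 // MasterGen: one-off source of seeds
        std::vector<std::mt19937_64> thread_gens;  // ThreadGen: long period, different type
        thread_gens.reserve(n_threads);
        for (std::size_t i = 0; i < n_threads; ++i)
            thread_gens.emplace_back(master());    // independent seed per thread
        return thread_gens;
    }
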
@@ -74,32 +74,46 @@ this first pass is used to get an estimate of the variance and calculate how man
 be required by the second pass to reach the desired error bound.
 * `max_calls_per_thread`: the maximum number of calls of the integrand by each thread in any single CUDA
 invocation. This parameter is used to prevent operating system timeouts from terminating the application.
-For example on Windows if the CUDA accellerated display driver is unresponsive for more than 2 seconds, then
+For example on Windows if the CUDA accelerated display driver is unresponsive for more than 2 seconds, then
 the application using it will simply be terminated.
 * `is_compensated`: when `true` each thread will use Kahan-style compensated addition to ensure accuracy. When
 `false` then regular addition will be used for improved performance.

 For example:

-// Define a function to integrate:
-auto g = [](std::vector<double> const & x)
-{
-constexpr const double A = 1.0 / (M_PI * M_PI * M_PI);
-return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2]));
-};
-std::vector<std::pair<double, double>> bounds{{0, M_PI}, {0, M_PI}, {0, M_PI}};
-double error_goal = 0.001
-naive_monte_carlo<double, decltype(g)> mc(g, bounds);
+// Define a function to integrate:
+auto g = [] __device__ (const double* x)
+{
+constexpr const double pi = boost::math::constants::pi<double>();
+constexpr const double A = 1.0 / (pi * pi * pi);
+return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2]));
+};
+std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } };
+double error_goal = 0.001;
+cuda_naive_monte_carlo<double, decltype(g)> mc(g, bounds);

 double result = mc.integrate(error_goal);

 std::cout << "Integration result is: " << result << std::endl;

 First off, we define the function we wish to integrate.
-This function must accept a `std::vector<Real> const &`, and return a `Real`.
-Next, we define the domain of integration.
+This function must accept a `const Real*`, and return a `Real`.
+In comparison to the regular non-CUDA code we lose a good deal of type safety
+as CUDA deals in raw pointers, not vectors - so it's up to the client to
+ensure that the number of elements accessed matches the number of dimensions passed
+to the integrator's constructor.
+Also note that we've used a lambda expression for the integrand: this currently requires
+compilation with `-expt-extended-lambda`.

+Next, we define the domain of integration as a vector of pairs - in this case
+the domain is [0, PI] over 3 dimensions.

+std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } };

 The call

 naive_monte_carlo<double, decltype(g)> mc(g, bounds);

-creates an instance of the monte carlo integrator, and the call to `integrate` then returns the result.
+creates an instance of the Monte-Carlo integrator, and the call to `integrate` then returns the result.

 [endsect]

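As the updated documentation notes, an integrand written as a `__device__` lambda needs nvcc's extended-lambda support. A minimal compile line might look like the following sketch; the file name is purely illustrative:

    nvcc -expt-extended-lambda -o cuda_monte_carlo_example cuda_monte_carlo_example.cu

Without that flag, nvcc rejects a lambda carrying a `__device__` annotation in host code.
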
@@ -30,7 +30,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
 };

 template <class F, class Gen, class Real>
-void __global__ cuda_naive_monte_carlo_device_proc(const F* f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
+void __global__ cuda_naive_monte_carlo_device_proc(F f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
 {
 int id = blockIdx.x * blockDim.x + threadIdx.x;

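The signature change above passes the callable `F` to the kernel by value rather than through a pointer to device memory: kernel arguments are copied to the device at launch, so an extended `__device__` lambda can be handed to the kernel directly. A self-contained sketch of that pattern, with purely illustrative names that are not part of Boost.Math:

    // Kernel takes the callable by value: the closure is copied to the device
    // along with the other kernel arguments, so no device-side storage for it is needed.
    template <class F>
    __global__ void apply_kernel(F f, double* out, unsigned n)
    {
        unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            out[i] = f(static_cast<double>(i));
    }

    int main()
    {
        const unsigned n = 256;
        double* out = nullptr;
        cudaMalloc(&out, n * sizeof(double));
        // Requires compilation with -expt-extended-lambda:
        auto f = [] __device__ (double x) { return x * x; };
        apply_kernel<<<1, n>>>(f, out, n);
        cudaDeviceSynchronize();
        cudaFree(out);
        return 0;
    }
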
@@ -44,7 +44,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
 {
 for (unsigned j = 0; j < n_dimentions; ++j)
 storage_base[j] = p_start_locations[j] + p_scales[j] * dist(gen);
-Real fv = (*f)(storage_base);
+Real fv = (f)(storage_base);
 Real y = fv - c;
 Real t = sum + y;
 c = (t - sum) - y;

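The `y`, `t` and `c` statements above are the Kahan-style compensated addition selected by `is_compensated`. A plain host-side sketch of the same technique, for illustration only:

    // Kahan compensated summation: `c` carries the low-order bits that a plain
    // `sum += value` would discard, so long sums stay accurate.
    double kahan_sum(const double* values, unsigned n)
    {
        double sum = 0, c = 0;
        for (unsigned i = 0; i < n; ++i)
        {
            double y = values[i] - c;   // apply the running compensation
            double t = sum + y;         // low-order bits of y are lost here...
            c = (t - sum) - y;          // ...and recovered for the next iteration
            sum = t;
        }
        return sum;
    }
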
@@ -62,7 +62,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
 }

 template <class F, class Gen, class Real>
-void __global__ cuda_naive_monte_carlo_fast_device_proc(const F* f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
+void __global__ cuda_naive_monte_carlo_fast_device_proc(F f, Gen* seeds, thrust::pair<Real, Real>* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first)
 {
 int id = blockIdx.x * blockDim.x + threadIdx.x;

@@ -75,7 +75,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
 {
 for (unsigned j = 0; j < n_dimentions; ++j)
 storage_base[j] = p_start_locations[j] + p_scales[j] * dist(gen);
-Real fv = (*f)(storage_base);
+Real fv = (f)(storage_base);
 sum += fv;
 sigma_squared += fv * fv;
 }

@@ -96,7 +96,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
 {
 cuda_naive_monte_carlo(const F& integrand,
 std::vector<std::pair<Real, Real>> const & bounds,
-const MasterGen& seed) : m_gen(seed), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
+const MasterGen& seed) : m_f(integrand), m_gen(seed), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
 {
 auto it = bounds.begin();
 while (it != bounds.end())

@@ -109,7 +109,7 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
 }
 cuda_naive_monte_carlo(const F& integrand,
 std::vector<std::pair<Real, Real>> const & bounds)
-: m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
+: m_f(integrand), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0)
 {
 auto it = bounds.begin();
 while (it != bounds.end())

@@ -145,8 +145,6 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
 for (unsigned i = 0; i < host_seeds.size(); ++i)
 host_seeds[i].seed(ui_dist(m_gen));
 seeds = host_seeds;
-thrust::device_vector<F> proc(1);
-proc[0] = m_f;
 bool first_call = true;
 bool have_variance = false;
 Real sample_variance = 0;

@@ -177,9 +175,9 @@ namespace boost { namespace math { namespace quadrature { namespace detail{
 calls_per_thread = max_calls_per_thread;
 // std::cout << "Executing with calls_per_thread = " << calls_per_thread << std::endl;
 if(is_compensated)
-detail::cuda_naive_monte_carlo_device_proc << <threads / 256, 256 >> > (to_pointer(proc.data()), to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
+detail::cuda_naive_monte_carlo_device_proc << <threads / 256, 256 >> > (m_f, to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
 else
-detail::cuda_naive_monte_carlo_fast_device_proc << <threads / 256, 256 >> > (to_pointer(proc.data()), to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
+detail::cuda_naive_monte_carlo_fast_device_proc << <threads / 256, 256 >> > (m_f, to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call);
 first_call = false;
 m_calls += threads * calls_per_thread;
 // If we haven't been called before then get an estimate of the sample

@@ -171,6 +171,28 @@ int main(void)

 test_host_hypersphere_10();

+/* Example code from docs */

+{

+// Define a function to integrate:
+auto g = [] __device__ (const double* x)
+{
+constexpr const double pi = boost::math::constants::pi<double>();
+constexpr const double A = 1.0 / (pi * pi * pi);
+return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2]));
+};
+std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } };
+double error_goal = 0.001;
+cuda_naive_monte_carlo<double, decltype(g)> mc(g, bounds);

+double result = mc.integrate(error_goal);

+std::cout << "Integration result is: " << result << std::endl;

+}

 return 0;
 }