Mirror of https://github.com/boostorg/math.git

Add bessel GPU support

Add GPU support to bessel_i0

Add CUDA and NVRTC testing

Add GPU support to bessel_i1

Add CUDA and NVRTC testing of bessel_i1

Add tgamma1pm1 NVRTC impl

Add GPU support to iconv

Add GPU test to bessel_ik

Add SYCL testing of complete bessel_i

Add GPU support to bessel_j0

Ignore BOOST_MATH_INSTRUMENT_VARIABLE on NVRTC

Add bessel J0 CUDA and NVRTC testing

Add GPU support to bessel_j1

Add bessel j1 CUDA and NVRTC testing

Add GPU support to bessel jn and jy

Add SYCL bessel j testing

Add bessel_k0 GPU support

Add bessel_k0 CUDA and NVRTC testing

Add GPU support to bessel_k1

Add bessel_k1 CUDA and NVRTC testing

Add GPU support to bessel_kn

Add bessel_kn CUDA and NVRTC testing

Add SYCL testing of complete bessel_k

Make Newton-Raphson GPU compatible

Make the complete bessel functions GPU compatible

Add SYCL bessel y testing

Apply changes for non-empty policy on CUDA

Add NVCC cyl_bessel_i testing

Add GPU support to sinc

Add GPU support to series functions

Add GPU support to bessel_jy_zero

Add array helper type

Make hypot GPU safe

Make bessel_yX GPU capable

Add bessel_y0 and bessel_y1 CUDA testing

Add nvrtc testing of bessel_y0 and bessel_y1

Fix macros

Add missing header

Add missing header

Markup iconv

Add iround for NVRTC

Add tgamma1pm1 with policy overload for NVRTC

Disable header

Fix factorial support for CUDA platforms

Add definition of bessel traits

Add cyl_bessel_i NVRTC testing

Fix cyl_bessel_jy warnings

Fix CUDA forward declarations

Fix maybe-unused variable warning

Add CUDA cyl_bessel_j testing

Add sign overload for lgamma

Fix warnings

Add NVRTC cyl_bessel_j testing

Add NVCC sph_bessel testing

Add NVRTC testing of sph_bessel

Add NVRTC testing of cyl_bessel_k

Add NVCC testing of cyl_bessel_k

Add NVCC testing of cyl_neumann

Add NVRTC cyl_neumann testing

Add NVRTC sph_neumann testing

Add NVCC sph_neumann testing
Author: Matt Borland
Date:   2024-08-20 16:40:47 -04:00
Parent: 1d40454024
Commit: 047c206c30
104 changed files with 10079 additions and 675 deletions
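The commit's central change is marking functions BOOST_MATH_GPU_ENABLED so they compile for device code. As orientation before the diffs, a minimal sketch (not part of the commit) of the kind of call site this enables under NVCC; the kernel name and indexing are illustrative only:

#include <boost/math/special_functions/bessel.hpp>

// Illustrative kernel: BOOST_MATH_GPU_ENABLED expands to __host__ __device__
// on CUDA builds, so the Bessel front-ends can be called on the device.
__global__ void cyl_bessel_j_kernel(const double* in, double* out, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        out[i] = boost::math::cyl_bessel_j(2.0, in[i]);
    }
}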

View File: include/boost/math/special_functions/bessel.hpp

@@ -15,8 +15,14 @@
# pragma once
#endif
#include <limits>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/promotion.hpp>
#include <boost/math/tools/series.hpp>
#include <boost/math/tools/roots.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/special_functions/detail/bessel_jy.hpp>
#include <boost/math/special_functions/detail/bessel_jn.hpp>
#include <boost/math/special_functions/detail/bessel_yn.hpp>
@@ -31,10 +37,8 @@
#include <boost/math/special_functions/sinc.hpp>
#include <boost/math/special_functions/trunc.hpp>
#include <boost/math/special_functions/round.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/promotion.hpp>
#include <boost/math/tools/series.hpp>
#include <boost/math/tools/roots.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#ifdef _MSC_VER
# pragma warning(push)
@@ -43,6 +47,50 @@
namespace boost{ namespace math{
// Since we cannot pull this in from math_fwd, we need a copy
#ifdef BOOST_MATH_HAS_NVRTC
namespace detail{
typedef boost::math::integral_constant<int, 0> bessel_no_int_tag; // No integer optimisation possible.
typedef boost::math::integral_constant<int, 1> bessel_maybe_int_tag; // Maybe integer optimisation.
typedef boost::math::integral_constant<int, 2> bessel_int_tag; // Definite integer optimisation.
template <class T1, class T2, class Policy>
struct bessel_traits
{
using result_type = typename boost::math::conditional<
boost::math::is_integral<T1>::value,
typename tools::promote_args<T2>::type,
tools::promote_args_t<T1, T2>
>::type;
typedef typename policies::precision<result_type, Policy>::type precision_type;
using optimisation_tag = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 64),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
using optimisation_tag128 = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 113),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
};
} // detail
#endif
namespace detail{
template <class T, class Policy>
@@ -50,7 +98,7 @@ struct sph_bessel_j_small_z_series_term
{
typedef T result_type;
sph_bessel_j_small_z_series_term(unsigned v_, T x)
BOOST_MATH_GPU_ENABLED sph_bessel_j_small_z_series_term(unsigned v_, T x)
: N(0), v(v_)
{
BOOST_MATH_STD_USING
@@ -64,7 +112,7 @@ struct sph_bessel_j_small_z_series_term
term = pow(mult, T(v)) / boost::math::tgamma(v+1+T(0.5f), Policy());
mult *= -mult;
}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
T r = term;
++N;
@@ -79,11 +127,11 @@ private:
};
template <class T, class Policy>
inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
sph_bessel_j_small_z_series_term<T, Policy> s(v, x);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon<T, Policy>(), max_iter);
@@ -92,10 +140,21 @@ inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol)
}
template <class T, class Policy>
T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp_final(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::bessel_j<%1%>(%1%,%1%)";
T result_J, y; // LCOV_EXCL_LINE
bessel_jy(v, x, &result_J, &y, need_j, pol);
return result_J;
}
// Dispatch function to avoid recursion
template <class T, class Policy>
BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
{
BOOST_MATH_STD_USING
if(x < 0)
{
// better have integer v:
@@ -105,23 +164,27 @@ T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
// This branch is hit by multiprecision types only, and is
// tested by our real_concept tests, but these are excluded from coverage
// due to time constraints.
T r = cyl_bessel_j_imp(v, T(-x), t, pol);
T r = cyl_bessel_j_imp_final(T(v), T(-x), t, pol);
if (iround(v, pol) & 1)
{
r = -r;
}
return r;
// LCOV_EXCL_STOP
}
else
{
constexpr auto function = "boost::math::bessel_j<%1%>(%1%,%1%)";
return policies::raise_domain_error<T>(function, "Got x = %1%, but we need x >= 0", x, pol);
}
}
T result_J, y; // LCOV_EXCL_LINE
bessel_jy(v, x, &result_J, &y, need_j, pol);
return result_J;
return cyl_bessel_j_imp_final(T(v), T(x), t, pol);
}
template <class T, class Policy>
inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names.
int ival = detail::iconv(v, pol);
@@ -135,14 +198,14 @@ inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& p
}
template <class T, class Policy>
inline T cyl_bessel_j_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
{
BOOST_MATH_STD_USING
return bessel_jn(v, x, pol);
}
template <class T, class Policy>
inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
if(x < 0)
@@ -171,7 +234,7 @@ inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol)
}
template <class T, class Policy>
T cyl_bessel_i_imp(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED T cyl_bessel_i_imp_final(T v, T x, const Policy& pol)
{
//
// This handles all the bessel I functions, note that we don't optimise
@@ -180,20 +243,7 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol)
// case has better error handling too).
//
BOOST_MATH_STD_USING
static const char* function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)";
if(x < 0)
{
// better have integer v:
if(floor(v) == v)
{
T r = cyl_bessel_i_imp(v, T(-x), pol);
if(iround(v, pol) & 1)
r = -r;
return r;
}
else
return policies::raise_domain_error<T>(function, "Got x = %1%, but we need x >= 0", x, pol);
}
constexpr auto function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)";
if(x == 0)
{
if(v < 0)
@@ -210,7 +260,7 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol)
}
return sqrt(2 / (x * constants::pi<T>())) * sinh(x);
}
if((policies::digits<T, Policy>() <= 113) && (std::numeric_limits<T>::digits <= 113) && (std::numeric_limits<T>::radix == 2))
if((policies::digits<T, Policy>() <= 113) && (boost::math::numeric_limits<T>::digits <= 113) && (boost::math::numeric_limits<T>::radix == 2))
{
if(v == 0)
{
@@ -228,10 +278,39 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol)
return result_I;
}
// Additional dispatch function to keep the GPU impls happy
template <class T, class Policy>
inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Policy& pol)
BOOST_MATH_GPU_ENABLED T cyl_bessel_i_imp(T v, T x, const Policy& pol)
{
static const char* function = "boost::math::cyl_bessel_k<%1%>(%1%,%1%)";
BOOST_MATH_STD_USING
constexpr auto function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)";
if(x < 0)
{
// better have integer v:
if(floor(v) == v)
{
T r = cyl_bessel_i_imp_final(T(v), T(-x), pol);
if(iround(v, pol) & 1)
{
r = -r;
}
return r;
}
else
{
return policies::raise_domain_error<T>(function, "Got x = %1%, but we need x >= 0", x, pol);
}
}
return cyl_bessel_i_imp_final(T(v), T(x), pol);
}
template <class T, class Policy>
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Policy& pol)
{
constexpr auto function = "boost::math::cyl_bessel_k<%1%>(%1%,%1%)";
BOOST_MATH_STD_USING
if(x < 0)
{
@@ -248,7 +327,7 @@ inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Poli
}
template <class T, class Policy>
inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
{
BOOST_MATH_STD_USING
if((floor(v) == v))
@@ -259,15 +338,15 @@ inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& p
}
template <class T, class Policy>
inline T cyl_bessel_k_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
{
return bessel_kn(v, x, pol);
}
template <class T, class Policy>
inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol)
{
static const char* function = "boost::math::cyl_neumann<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::cyl_neumann<%1%>(%1%,%1%)";
BOOST_MATH_INSTRUMENT_VARIABLE(v);
BOOST_MATH_INSTRUMENT_VARIABLE(x);
@@ -291,7 +370,7 @@ inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol)
}
template <class T, class Policy>
inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
{
BOOST_MATH_STD_USING
@@ -310,16 +389,16 @@ inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& po
}
template <class T, class Policy>
inline T cyl_neumann_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
{
return bessel_yn(v, x, pol);
}
template <class T, class Policy>
inline T sph_neumann_imp(unsigned v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T sph_neumann_imp(unsigned v, T x, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
static const char* function = "boost::math::sph_neumann<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::sph_neumann<%1%>(%1%,%1%)";
//
// Nothing much to do here but check for errors, and
// evaluate the function's definition directly:
@@ -340,11 +419,11 @@ inline T sph_neumann_imp(unsigned v, T x, const Policy& pol)
}
template <class T, class Policy>
inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names, needed for floor.
static const char* function = "boost::math::cyl_bessel_j_zero<%1%>(%1%, int)";
constexpr auto function = "boost::math::cyl_bessel_j_zero<%1%>(%1%, int)";
const T half_epsilon(boost::math::tools::epsilon<T>() / 2U);
@@ -395,7 +474,7 @@ inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol)
const T guess_root = boost::math::detail::bessel_zero::cyl_bessel_j_zero_detail::initial_guess<T, Policy>((order_is_integer ? vv : v), m, pol);
// Select the maximum allowed iterations from the policy.
std::uintmax_t number_of_iterations = policies::get_max_root_iterations<Policy>();
boost::math::uintmax_t number_of_iterations = policies::get_max_root_iterations<Policy>();
const T delta_lo = ((guess_root > 0.2F) ? T(0.2) : T(guess_root / 2U));
@@ -418,11 +497,11 @@ inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol)
}
template <class T, class Policy>
inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names, needed for floor.
static const char* function = "boost::math::cyl_neumann_zero<%1%>(%1%, int)";
constexpr auto function = "boost::math::cyl_neumann_zero<%1%>(%1%, int)";
// Handle non-finite order.
if (!(boost::math::isfinite)(v) )
@@ -473,7 +552,7 @@ inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol)
const T guess_root = boost::math::detail::bessel_zero::cyl_neumann_zero_detail::initial_guess<T, Policy>(v, m, pol);
// Select the maximum allowed iterations from the policy.
std::uintmax_t number_of_iterations = policies::get_max_root_iterations<Policy>();
boost::math::uintmax_t number_of_iterations = policies::get_max_root_iterations<Policy>();
const T delta_lo = ((guess_root > 0.2F) ? T(0.2) : T(guess_root / 2U));
@@ -498,7 +577,7 @@ inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol)
} // namespace detail
template <class T1, class T2, class Policy>
inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j(T1 v, T2 x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j(T1 v, T2 x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -514,13 +593,13 @@ inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j(
}
template <class T1, class T2>
inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j(T1 v, T2 x)
{
return cyl_bessel_j(v, x, policies::policy<>());
}
template <class T, class Policy>
inline typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel(unsigned v, T x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel(unsigned v, T x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T, T, Policy>::result_type result_type;
@@ -535,13 +614,13 @@ inline typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel(unsi
}
template <class T>
inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel(unsigned v, T x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel(unsigned v, T x)
{
return sph_bessel(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i(T1 v, T2 x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i(T1 v, T2 x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -556,13 +635,13 @@ inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i(
}
template <class T1, class T2>
inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i(T1 v, T2 x)
{
return cyl_bessel_i(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k(T1 v, T2 x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k(T1 v, T2 x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -578,13 +657,13 @@ inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k(
}
template <class T1, class T2>
inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k(T1 v, T2 x)
{
return cyl_bessel_k(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann(T1 v, T2 x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann(T1 v, T2 x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -600,13 +679,13 @@ inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann(T
}
template <class T1, class T2>
inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann(T1 v, T2 x)
{
return cyl_neumann(v, x, policies::policy<>());
}
template <class T, class Policy>
inline typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann(unsigned v, T x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann(unsigned v, T x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T, T, Policy>::result_type result_type;
@@ -621,13 +700,13 @@ inline typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann(uns
}
template <class T>
inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann(unsigned v, T x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann(unsigned v, T x)
{
return sph_neumann(v, x, policies::policy<>());
}
template <class T, class Policy>
inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_bessel_j_zero(T v, int m, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_bessel_j_zero(T v, int m, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T, T, Policy>::result_type result_type;
@@ -639,35 +718,35 @@ inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_bessel_j_ze
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
return policies::checked_narrowing_cast<result_type, Policy>(detail::cyl_bessel_j_zero_imp<value_type>(v, m, forwarding_policy()), "boost::math::cyl_bessel_j_zero<%1%>(%1%,%1%)");
}
template <class T>
inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_bessel_j_zero(T v, int m)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_bessel_j_zero(T v, int m)
{
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
return cyl_bessel_j_zero<T, policies::policy<> >(v, m, policies::policy<>());
}
template <class T, class OutputIterator, class Policy>
inline OutputIterator cyl_bessel_j_zero(T v,
BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_bessel_j_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it,
const Policy& pol)
{
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
for(int i = 0; i < static_cast<int>(number_of_zeros); ++i)
@@ -679,7 +758,7 @@ inline OutputIterator cyl_bessel_j_zero(T v,
}
template <class T, class OutputIterator>
inline OutputIterator cyl_bessel_j_zero(T v,
BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_bessel_j_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it)
@@ -688,7 +767,7 @@ inline OutputIterator cyl_bessel_j_zero(T v,
}
template <class T, class Policy>
inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_neumann_zero(T v, int m, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_neumann_zero(T v, int m, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T, T, Policy>::result_type result_type;
@@ -700,35 +779,35 @@ inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_neumann_zer
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
return policies::checked_narrowing_cast<result_type, Policy>(detail::cyl_neumann_zero_imp<value_type>(v, m, forwarding_policy()), "boost::math::cyl_neumann_zero<%1%>(%1%,%1%)");
}
template <class T>
inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_neumann_zero(T v, int m)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_neumann_zero(T v, int m)
{
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
return cyl_neumann_zero<T, policies::policy<> >(v, m, policies::policy<>());
}
template <class T, class OutputIterator, class Policy>
inline OutputIterator cyl_neumann_zero(T v,
BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_neumann_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it,
const Policy& pol)
{
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
for(int i = 0; i < static_cast<int>(number_of_zeros); ++i)
@@ -740,7 +819,7 @@ inline OutputIterator cyl_neumann_zero(T v,
}
template <class T, class OutputIterator>
inline OutputIterator cyl_neumann_zero(T v,
BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_neumann_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it)
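A minimal illustration (not part of the diff) of what the NVRTC copy of bessel_traits near the top of this file computes: an integral order promotes from the argument type alone, otherwise both arguments participate. std::is_same is used here purely for exposition:

#include <type_traits>
#include <boost/math/special_functions/bessel.hpp>

// Integral order: result_type comes from promote_args<T2> alone.
static_assert(std::is_same<
    boost::math::detail::bessel_traits<int, float, boost::math::policies::policy<>>::result_type,
    float>::value, "integral order promotes the argument type only");

// Two floating-point arguments: both participate in the promotion.
static_assert(std::is_same<
    boost::math::detail::bessel_traits<double, float, boost::math::policies::policy<>>::result_type,
    double>::value, "mixed arguments promote to the wider type");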

View File: include/boost/math/special_functions/detail/airy_ai_bi_zero.hpp

@@ -13,6 +13,8 @@
#ifndef BOOST_MATH_AIRY_AI_BI_ZERO_2013_01_20_HPP_
#define BOOST_MATH_AIRY_AI_BI_ZERO_2013_01_20_HPP_
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/special_functions/cbrt.hpp>
@@ -21,18 +23,18 @@
{
// Forward declarations of the needed Airy function implementations.
template <class T, class Policy>
T airy_ai_imp(T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED T airy_ai_imp(T x, const Policy& pol);
template <class T, class Policy>
T airy_bi_imp(T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED T airy_bi_imp(T x, const Policy& pol);
template <class T, class Policy>
T airy_ai_prime_imp(T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED T airy_ai_prime_imp(T x, const Policy& pol);
template <class T, class Policy>
T airy_bi_prime_imp(T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED T airy_bi_prime_imp(T x, const Policy& pol);
namespace airy_zero
{
template<class T, class Policy>
T equation_as_10_4_105(const T& z, const Policy& pol)
BOOST_MATH_GPU_ENABLED T equation_as_10_4_105(const T& z, const Policy& pol)
{
const T one_over_z (T(1) / z);
const T one_over_z_squared(one_over_z * one_over_z);
@@ -54,7 +56,7 @@
namespace airy_ai_zero_detail
{
template<class T, class Policy>
T initial_guess(const int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED T initial_guess(const int m, const Policy& pol)
{
T guess;
@@ -106,11 +108,19 @@
class function_object_ai_and_ai_prime
{
public:
explicit function_object_ai_and_ai_prime(const Policy& pol) : my_pol(pol) { }
BOOST_MATH_GPU_ENABLED explicit function_object_ai_and_ai_prime(const Policy& pol) : my_pol(pol) { }
function_object_ai_and_ai_prime(const function_object_ai_and_ai_prime&) = default;
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_suppress 20012
#endif
boost::math::tuple<T, T> operator()(const T& x) const
BOOST_MATH_GPU_ENABLED function_object_ai_and_ai_prime(const function_object_ai_and_ai_prime&) = default;
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_default 20012
#endif
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(const T& x) const
{
// Return a tuple containing both Ai(x) and Ai'(x).
return boost::math::make_tuple(
@@ -127,7 +137,7 @@
namespace airy_bi_zero_detail
{
template<class T, class Policy>
T initial_guess(const int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED T initial_guess(const int m, const Policy& pol)
{
T guess;
@@ -179,11 +189,19 @@
class function_object_bi_and_bi_prime
{
public:
explicit function_object_bi_and_bi_prime(const Policy& pol) : my_pol(pol) { }
BOOST_MATH_GPU_ENABLED explicit function_object_bi_and_bi_prime(const Policy& pol) : my_pol(pol) { }
function_object_bi_and_bi_prime(const function_object_bi_and_bi_prime&) = default;
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_suppress 20012
#endif
BOOST_MATH_GPU_ENABLED function_object_bi_and_bi_prime(const function_object_bi_and_bi_prime&) = default;
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_default 20012
#endif
boost::math::tuple<T, T> operator()(const T& x) const
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(const T& x) const
{
// Return a tuple containing both Bi(x) and Bi'(x).
return boost::math::make_tuple(
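These function objects exist to feed the Newton-Raphson solver that the commit makes GPU compatible: the solver expects a callable returning the function value and its derivative. A minimal sketch of that contract, with a hypothetical functor and placeholder bracket/iteration values:

#include <boost/math/tools/roots.hpp>
#include <boost/math/tools/tuple.hpp>

// Hypothetical functor in the same style as the Airy objects above:
// operator() returns { f(x), f'(x) } for Newton steps on host or device.
struct sqrt_two_functor
{
   BOOST_MATH_GPU_ENABLED boost::math::tuple<double, double> operator()(double x) const
   {
      return boost::math::make_tuple(x * x - 2, 2 * x);
   }
};

BOOST_MATH_GPU_ENABLED inline double sqrt_two_by_newton()
{
   boost::math::uintmax_t max_iter = 50; // placeholder iteration cap
   return boost::math::tools::newton_raphson_iterate(
      sqrt_two_functor(), 1.5, 1.0, 2.0, 40, max_iter);
}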

View File: include/boost/math/special_functions/detail/bessel_i0.hpp

@@ -1,5 +1,6 @@
// Copyright (c) 2006 Xiaogang Zhang
// Copyright (c) 2017 John Maddock
// Copyright (c) 2024 Matt Borland
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -14,6 +15,9 @@
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/precision.hpp>
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -35,24 +39,24 @@
namespace boost { namespace math { namespace detail{
template <typename T>
T bessel_i0(const T& x);
BOOST_MATH_GPU_ENABLED T bessel_i0(const T& x);
template <typename T, int N>
T bessel_i0_imp(const T&, const std::integral_constant<int, N>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T&, const boost::math::integral_constant<int, N>&)
{
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
T bessel_i0_imp(const T& x, const std::integral_constant<int, 24>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant<int, 24>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
{
// Max error in interpolated form: 3.929e-08
// Max Error found at float precision = Poly: 1.991226e-07
static const float P[] = {
BOOST_MATH_STATIC const float P[] = {
1.00000003928615375e+00f,
2.49999576572179639e-01f,
2.77785268558399407e-02f,
@@ -70,7 +74,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 24>&)
{
// Max error in interpolated form: 5.195e-08
// Max Error found at float precision = Poly: 8.502534e-08
static const float P[] = {
BOOST_MATH_STATIC const float P[] = {
3.98942651588301770e-01f,
4.98327234176892844e-02f,
2.91866904423115499e-02f,
@@ -83,7 +87,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 24>&)
{
// Max error in interpolated form: 1.782e-09
// Max Error found at float precision = Poly: 6.473568e-08
static const float P[] = {
BOOST_MATH_STATIC const float P[] = {
3.98942391532752700e-01f,
4.98455950638200020e-02f,
2.94835666900682535e-02f
@@ -96,7 +100,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 24>&)
}
template <typename T>
T bessel_i0_imp(const T& x, const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant<int, 53>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -104,7 +108,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 53>&)
// Bessel I0 over [10^-16, 7.75]
// Max error in interpolated form : 3.042e-18
// Max Error found at double precision = Poly : 5.106609e-16 Cheb : 5.239199e-16
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
1.00000000000000000e+00,
2.49999999999999909e-01,
2.77777777777782257e-02,
@@ -128,7 +132,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 53>&)
{
// Max error in interpolated form : 1.685e-16
// Max Error found at double precision = Poly : 2.575063e-16 Cheb : 2.247615e+00
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
3.98942280401425088e-01,
4.98677850604961985e-02,
2.80506233928312623e-02,
@@ -158,7 +162,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 53>&)
{
// Max error in interpolated form : 2.437e-18
// Max Error found at double precision = Poly : 1.216719e-16
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
3.98942280401432905e-01,
4.98677850491434560e-02,
2.80506308916506102e-02,
@@ -173,7 +177,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 53>&)
}
template <typename T>
T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant<int, 64>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -182,7 +186,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
// Max error in interpolated form : 3.899e-20
// Max Error found at float80 precision = Poly : 1.770840e-19
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 9.99999999999999999961011629e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.50000000000000001321873912e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.77777777777777703400424216e-02),
@@ -211,8 +215,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
// Maximum Relative Change in Control Points : 1.631e-04
// Max Error found at float80 precision = Poly : 7.811948e-21
// LCOV_EXCL_START
static const T Y = 4.051098823547363281250e-01f;
static const T P[] = {
BOOST_MATH_STATIC const T Y = 4.051098823547363281250e-01f;
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, -6.158081780620616479492e-03),
BOOST_MATH_BIG_CONSTANT(T, 64, 4.883635969834048766148e-02),
BOOST_MATH_BIG_CONSTANT(T, 64, 7.892782002476195771920e-02),
@@ -237,8 +241,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
// Maximum Relative Change in Control Points : 1.304e-03
// Max Error found at float80 precision = Poly : 2.303527e-20
// LCOV_EXCL_START
static const T Y = 4.033188819885253906250e-01f;
static const T P[] = {
BOOST_MATH_STATIC const T Y = 4.033188819885253906250e-01f;
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, -4.376373876116109401062e-03),
BOOST_MATH_BIG_CONSTANT(T, 64, 4.982899138682911273321e-02),
BOOST_MATH_BIG_CONSTANT(T, 64, 3.109477529533515397644e-02),
@@ -262,8 +266,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
// Max error in interpolated form: 1.035e-21
// Max Error found at float80 precision = Poly: 1.885872e-21
// LCOV_EXCL_START
static const T Y = 4.011702537536621093750e-01f;
static const T P[] = {
BOOST_MATH_STATIC const T Y = 4.011702537536621093750e-01f;
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, -2.227973351806078464328e-03),
BOOST_MATH_BIG_CONSTANT(T, 64, 4.986778486088017419036e-02),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.805066823812285310011e-02),
@@ -291,7 +295,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
// Max error in interpolated form : 5.587e-20
// Max Error found at float80 precision = Poly : 8.776852e-20
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401432677955074061e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 4.98677850501789875615574058e-02),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.80506290908675604202206833e-02),
@@ -320,7 +324,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
}
template <typename T>
T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant<int, 113>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -329,7 +333,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form : 1.274e-34
// Max Error found at float128 precision = Poly : 3.096091e-34
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 1.0000000000000000000000000000000001273856e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.4999999999999999999999999999999107477496e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.7777777777777777777777777777881795230918e-02),
@@ -364,7 +368,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form : 7.534e-35
// Max Error found at float128 precision = Poly : 6.123912e-34
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 9.9999999999999999992388573069504617493518e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.5000000000000000007304739268173096975340e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.7777777777777777744261405400543564492074e-02),
@@ -403,7 +407,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form : 1.808e-34
// Max Error found at float128 precision = Poly : 2.399403e-34
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040870793650581242239624530714032e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867780576714783790784348982178607842250e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.8051948347934462928487999569249907599510e-02),
@@ -445,7 +449,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form : 1.487e-34
// Max Error found at float128 precision = Poly : 1.929924e-34
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793996798658172135362278e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867785050179084714910130342157246539820e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.8050629090725751585266360464766768437048e-02),
@@ -480,7 +484,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form : 5.459e-35
// Max Error found at float128 precision = Poly : 1.472240e-34
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793994605993438166526772e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867785050179084742493257495245185241487e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.8050629090725735167652437695397756897920e-02),
@@ -507,33 +511,33 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
}
template <typename T>
T bessel_i0_imp(const T& x, const std::integral_constant<int, 0>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant<int, 0>&)
{
if(boost::math::tools::digits<T>() <= 24)
return bessel_i0_imp(x, std::integral_constant<int, 24>());
return bessel_i0_imp(x, boost::math::integral_constant<int, 24>());
else if(boost::math::tools::digits<T>() <= 53)
return bessel_i0_imp(x, std::integral_constant<int, 53>());
return bessel_i0_imp(x, boost::math::integral_constant<int, 53>());
else if(boost::math::tools::digits<T>() <= 64)
return bessel_i0_imp(x, std::integral_constant<int, 64>());
return bessel_i0_imp(x, boost::math::integral_constant<int, 64>());
else if(boost::math::tools::digits<T>() <= 113)
return bessel_i0_imp(x, std::integral_constant<int, 113>());
return bessel_i0_imp(x, boost::math::integral_constant<int, 113>());
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
inline T bessel_i0(const T& x)
BOOST_MATH_GPU_ENABLED inline T bessel_i0(const T& x)
{
typedef std::integral_constant<int,
((std::numeric_limits<T>::digits == 0) || (std::numeric_limits<T>::radix != 2)) ?
typedef boost::math::integral_constant<int,
((boost::math::numeric_limits<T>::digits == 0) || (boost::math::numeric_limits<T>::radix != 2)) ?
0 :
std::numeric_limits<T>::digits <= 24 ?
boost::math::numeric_limits<T>::digits <= 24 ?
24 :
std::numeric_limits<T>::digits <= 53 ?
boost::math::numeric_limits<T>::digits <= 53 ?
53 :
std::numeric_limits<T>::digits <= 64 ?
boost::math::numeric_limits<T>::digits <= 64 ?
64 :
std::numeric_limits<T>::digits <= 113 ?
boost::math::numeric_limits<T>::digits <= 113 ?
113 : -1
> tag_type;
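The front end above picks a digit-tagged overload at compile time so only one set of polynomial coefficients is instantiated. A simplified sketch of the idiom, reduced to two precision tiers (the real selector handles 24/53/64/113 plus a runtime fallback):

#include <boost/math/special_functions/detail/bessel_i0.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>

// Simplified tag dispatch: numeric_limits<T>::digits selects the overload.
template <typename T>
BOOST_MATH_GPU_ENABLED T bessel_i0_sketch(const T& x)
{
   typedef boost::math::integral_constant<int,
      boost::math::numeric_limits<T>::digits <= 53 ? 53 : 113> tag_type;
   return boost::math::detail::bessel_i0_imp(x, tag_type());
}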

View File: include/boost/math/special_functions/detail/bessel_i1.hpp

@@ -1,4 +1,5 @@
// Copyright (c) 2017 John Maddock
// Copyright (c) 2024 Matt Borland
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -17,9 +18,13 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/precision.hpp>
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -38,24 +43,24 @@
namespace boost { namespace math { namespace detail{
template <typename T>
T bessel_i1(const T& x);
BOOST_MATH_GPU_ENABLED T bessel_i1(const T& x);
template <typename T, int N>
T bessel_i1_imp(const T&, const std::integral_constant<int, N>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T&, const boost::math::integral_constant<int, N>&)
{
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
T bessel_i1_imp(const T& x, const std::integral_constant<int, 24>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant<int, 24>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
{
//Max error in interpolated form : 1.348e-08
// Max Error found at float precision = Poly : 1.469121e-07
static const float P[] = {
BOOST_MATH_STATIC const float P[] = {
8.333333221e-02f,
6.944453712e-03f,
3.472097211e-04f,
@@ -74,7 +79,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 24>&)
// Max error in interpolated form: 9.000e-08
// Max Error found at float precision = Poly: 1.044345e-07
static const float P[] = {
BOOST_MATH_STATIC const float P[] = {
3.98942115977513013e-01f,
-1.49581264836620262e-01f,
-4.76475741878486795e-02f,
@@ -89,7 +94,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 24>&)
}
template <typename T>
T bessel_i1_imp(const T& x, const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant<int, 53>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -98,7 +103,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 53>&)
// Max error in interpolated form: 5.639e-17
// Max Error found at double precision = Poly: 1.795559e-16
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
8.333333333333333803e-02,
6.944444444444341983e-03,
3.472222222225921045e-04,
@@ -122,7 +127,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 53>&)
// Max error in interpolated form: 1.796e-16
// Max Error found at double precision = Poly: 2.898731e-16
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
3.989422804014406054e-01,
-1.496033551613111533e-01,
-4.675104253598537322e-02,
@@ -152,7 +157,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 53>&)
{
// Max error in interpolated form: 1.320e-19
// Max Error found at double precision = Poly: 7.065357e-17
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
3.989422804014314820e-01,
-1.496033551467584157e-01,
-4.675105322571775911e-02,
@@ -167,7 +172,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 53>&)
}
template <typename T>
T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant<int, 64>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -175,7 +180,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
// Bessel I1 over [10^-16, 7.75]
// Max error in interpolated form: 8.086e-21
// Max Error found at float80 precision = Poly: 7.225090e-20
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 8.33333333333333333340071817e-02),
BOOST_MATH_BIG_CONSTANT(T, 64, 6.94444444444444442462728070e-03),
BOOST_MATH_BIG_CONSTANT(T, 64, 3.47222222222222318886683883e-04),
@@ -203,7 +208,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
// Maximum Deviation Found : 3.887e-20
// Expected Error Term : 3.887e-20
// Maximum Relative Change in Control Points : 1.681e-04
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942260530218897338680e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.49599542849073670179540e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.70492865454119188276875e-02),
@@ -236,7 +241,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
// Maximum Relative Change in Control Points : 2.101e-03
// Max Error found at float80 precision = Poly : 6.029974e-20
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401431675205845e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.49603355149968887210170e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.67510486284376330257260e-02),
@@ -258,7 +263,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
// Bessel I1 over [100, INF]
// Max error in interpolated form: 2.456e-20
// Max Error found at float80 precision = Poly: 5.446356e-20
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401432677958445e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.49603355150537411254359e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.67510484842456251368526e-02),
@@ -276,7 +281,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
}
template <typename T>
T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant<int, 113>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -285,7 +290,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form: 1.835e-35
// Max Error found at float128 precision = Poly: 1.645036e-34
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 8.3333333333333333333333333333333331804098e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 6.9444444444444444444444444444445418303082e-03),
BOOST_MATH_BIG_CONSTANT(T, 113, 3.4722222222222222222222222222119082346591e-04),
@@ -321,7 +326,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Maximum Relative Change in Control Points : 5.204e-03
// Max Error found at float128 precision = Poly : 2.882561e-34
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 8.333333333333333326889717360850080939e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 6.944444444444444511272790848815114507e-03),
BOOST_MATH_BIG_CONSTANT(T, 113, 3.472222222222221892451965054394153443e-04),
@@ -355,7 +360,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Maximum Deviation Found : 1.766e-35
// Expected Error Term : 1.021e-35
// Maximum Relative Change in Control Points : 6.228e-03
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 8.333333333333255774414858563409941233e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 6.944444444444897867884955912228700291e-03),
BOOST_MATH_BIG_CONSTANT(T, 113, 3.472222222220954970397343617150959467e-04),
@@ -389,7 +394,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
{
// Max error in interpolated form: 8.864e-36
// Max Error found at float128 precision = Poly: 8.522841e-35
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422793693152031514179994954750043e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -1.496029423752889591425633234009799670e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -4.682975926820553021482820043377990241e-02),
@@ -421,7 +426,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form: 6.028e-35
// Max Error found at float128 precision = Poly: 1.368313e-34
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422804012941975429616956496046931e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -1.496033550576049830976679315420681402e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -4.675107835141866009896710750800622147e-02),
@@ -456,7 +461,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form: 5.494e-35
// Max Error found at float128 precision = Poly: 1.214651e-34
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422804014326779399307367861631577e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -1.496033551505372542086590873271571919e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -4.675104848454290286276466276677172664e-02),
@@ -486,7 +491,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Bessel I1 over [100, INF]
// Max error in interpolated form: 6.081e-35
// Max Error found at float128 precision = Poly: 1.407151e-34
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793994605993438200208417e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -1.4960335515053725422747977247811372936584e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -4.6751048484542891946087411826356811991039e-02),
@@ -512,33 +517,33 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
}
template <typename T>
T bessel_i1_imp(const T& x, const std::integral_constant<int, 0>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant<int, 0>&)
{
if(boost::math::tools::digits<T>() <= 24)
return bessel_i1_imp(x, std::integral_constant<int, 24>());
return bessel_i1_imp(x, boost::math::integral_constant<int, 24>());
else if(boost::math::tools::digits<T>() <= 53)
return bessel_i1_imp(x, std::integral_constant<int, 53>());
return bessel_i1_imp(x, boost::math::integral_constant<int, 53>());
else if(boost::math::tools::digits<T>() <= 64)
return bessel_i1_imp(x, std::integral_constant<int, 64>());
return bessel_i1_imp(x, boost::math::integral_constant<int, 64>());
else if(boost::math::tools::digits<T>() <= 113)
return bessel_i1_imp(x, std::integral_constant<int, 113>());
return bessel_i1_imp(x, boost::math::integral_constant<int, 113>());
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
inline T bessel_i1(const T& x)
BOOST_MATH_GPU_ENABLED inline T bessel_i1(const T& x)
{
typedef std::integral_constant<int,
((std::numeric_limits<T>::digits == 0) || (std::numeric_limits<T>::radix != 2)) ?
typedef boost::math::integral_constant<int,
((boost::math::numeric_limits<T>::digits == 0) || (boost::math::numeric_limits<T>::radix != 2)) ?
0 :
std::numeric_limits<T>::digits <= 24 ?
boost::math::numeric_limits<T>::digits <= 24 ?
24 :
std::numeric_limits<T>::digits <= 53 ?
boost::math::numeric_limits<T>::digits <= 53 ?
53 :
std::numeric_limits<T>::digits <= 64 ?
boost::math::numeric_limits<T>::digits <= 64 ?
64 :
std::numeric_limits<T>::digits <= 113 ?
boost::math::numeric_limits<T>::digits <= 113 ?
113 : -1
> tag_type;

View File: include/boost/math/special_functions/detail/bessel_ik.hpp

@@ -1,4 +1,5 @@
// Copyright (c) 2006 Xiaogang Zhang
// Copyright (c) 2024 Matt Borland
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,14 +11,17 @@
#pragma once
#endif
#include <cmath>
#include <cstdint>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/series.hpp>
#include <boost/math/special_functions/sign.hpp>
#include <boost/math/special_functions/round.hpp>
#include <boost/math/special_functions/gamma.hpp>
#include <boost/math/special_functions/sin_pi.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/tools/config.hpp>
// Modified Bessel functions of the first and second kind of fractional order
@@ -30,13 +34,13 @@ struct cyl_bessel_i_small_z
{
typedef T result_type;
cyl_bessel_i_small_z(T v_, T z_) : k(0), v(v_), mult(z_*z_/4)
BOOST_MATH_GPU_ENABLED cyl_bessel_i_small_z(T v_, T z_) : k(0), v(v_), mult(z_*z_/4)
{
BOOST_MATH_STD_USING
term = 1;
}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
T result = term;
++k;
@@ -52,7 +56,7 @@ private:
};
template <class T, class Policy>
inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
{
BOOST_MATH_STD_USING
T prefix;
@@ -69,7 +73,7 @@ inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
return prefix;
cyl_bessel_i_small_z<T, Policy> s(v, x);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon<T, Policy>(), max_iter);
@@ -80,7 +84,7 @@ inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
// Calculate K(v, x) and K(v+1, x) by method analogous to
// Temme, Journal of Computational Physics, vol 21, 343 (1976)
template <typename T, typename Policy>
int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol)
BOOST_MATH_GPU_ENABLED int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol)
{
T f, h, p, q, coef, sum, sum1, tolerance;
T a, b, c, d, sigma, gamma1, gamma2;
@@ -157,7 +161,7 @@ int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol)
// Evaluate continued fraction fv = I_(v+1) / I_v, derived from
// Abramowitz and Stegun, Handbook of Mathematical Functions, 1972, 9.1.73
template <typename T, typename Policy>
int CF1_ik(T v, T x, T* fv, const Policy& pol)
BOOST_MATH_GPU_ENABLED int CF1_ik(T v, T x, T* fv, const Policy& pol)
{
T C, D, f, a, b, delta, tiny, tolerance;
unsigned long k;
@@ -204,7 +208,7 @@ int CF1_ik(T v, T x, T* fv, const Policy& pol)
// z1 / z0 = U(v+1.5, 2v+1, 2x) / U(v+0.5, 2v+1, 2x), see
// Thompson and Barnett, Computer Physics Communications, vol 47, 245 (1987)
template <typename T, typename Policy>
int CF2_ik(T v, T x, T* Kv, T* Kv1, const Policy& pol)
BOOST_MATH_GPU_ENABLED int CF2_ik(T v, T x, T* Kv, T* Kv1, const Policy& pol)
{
BOOST_MATH_STD_USING
using namespace boost::math::constants;
@@ -297,7 +301,7 @@ enum{
// Compute I(v, x) and K(v, x) simultaneously by Temme's method, see
// Temme, Journal of Computational Physics, vol 19, 324 (1975)
template <typename T, typename Policy>
int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
BOOST_MATH_GPU_ENABLED int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
{
// Kv1 = K_(v+1), fv = I_(v+1) / I_v
// Ku1 = K_(u+1), fu = I_(u+1) / I_u
@@ -314,7 +318,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
using namespace boost::math::tools;
using namespace boost::math::constants;
static const char* function = "boost::math::bessel_ik<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::bessel_ik<%1%>(%1%,%1%)";
if (v < 0)
{
@@ -329,7 +333,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
if (((kind & need_i) == 0) && (fabs(4 * v * v - 25) / (8 * x) < tools::forth_root_epsilon<T>()))
{
// A&S 9.7.2
Iv = std::numeric_limits<T>::quiet_NaN(); // any value will do
Iv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do
T mu = 4 * v * v;
T eight_z = 8 * x;
Kv = 1 + (mu - 1) / eight_z + (mu - 1) * (mu - 9) / (2 * eight_z * eight_z) + (mu - 1) * (mu - 9) * (mu - 25) / (6 * eight_z * eight_z * eight_z);
@@ -410,7 +414,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
}
}
else
Iv = std::numeric_limits<T>::quiet_NaN(); // any value will do
Iv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do
}
if (reflect)
{
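Throughout this file the std:: entities are swapped for boost::math:: shims (uintmax_t, numeric_limits<T>::quiet_NaN) because NVRTC has no standard headers. A simplified sketch of the shim idea — the real definitions live in boost/math/tools/cstdint.hpp and boost/math/tools/numeric_limits.hpp and are more complete than this:

#ifndef BOOST_MATH_HAS_NVRTC
#include <cstdint>
#endif

namespace boost { namespace math {

#ifdef BOOST_MATH_HAS_NVRTC
typedef unsigned long long uintmax_t;  // device-safe stand-in: no <cstdint> on NVRTC
#else
using std::uintmax_t;                  // host builds: exactly the std type
#endif

}} // namespace boost::math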

View File: include/boost/math/special_functions/detail/bessel_j0.hpp

@@ -10,6 +10,7 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
@@ -32,10 +33,10 @@
namespace boost { namespace math { namespace detail{
template <typename T>
T bessel_j0(T x);
BOOST_MATH_GPU_ENABLED T bessel_j0(T x);
template <typename T>
T bessel_j0(T x)
BOOST_MATH_GPU_ENABLED T bessel_j0(T x)
{
#ifdef BOOST_MATH_INSTRUMENT
static bool b = false;
@@ -48,7 +49,7 @@ T bessel_j0(T x)
}
#endif
static const T P1[] = {
BOOST_MATH_STATIC const T P1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.1298668500990866786e+11)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.7282507878605942706e+10)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.2140700423540120665e+08)),
@@ -57,7 +58,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0344222815443188943e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2117036164593528341e-01))
};
static const T Q1[] = {
BOOST_MATH_STATIC const T Q1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.3883787996332290397e+12)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.6328198300859648632e+10)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.3985097372263433271e+08)),
@@ -66,7 +67,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0))
};
static const T P2[] = {
BOOST_MATH_STATIC const T P2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8319397969392084011e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2254078161378989535e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -7.2879702464464618998e+03)),
@@ -76,7 +77,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.4321196680624245801e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.8591703355916499363e+01))
};
static const T Q2[] = {
BOOST_MATH_STATIC const T Q2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.5783478026152301072e+05)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4599102262586308984e+05)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.4055062591169562211e+04)),
@@ -86,7 +87,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -2.5258076240801555057e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T PC[] = {
BOOST_MATH_STATIC const T PC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684302e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1345386639580765797e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1170523380864944322e+04)),
@@ -94,7 +95,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5376201909008354296e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.8961548424210455236e-01))
};
static const T QC[] = {
BOOST_MATH_STATIC const T QC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684318e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1370412495510416640e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1215350561880115730e+04)),
@@ -102,7 +103,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5711159858080893649e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T PS[] = {
BOOST_MATH_STATIC const T PS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.9226600200800094098e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8591953644342993800e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1183429920482737611e+02)),
@@ -110,7 +111,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2441026745835638459e+00)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.8033303048680751817e-03))
};
static const T QS[] = {
BOOST_MATH_STATIC const T QS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.7105024128512061905e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1951131543434613647e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.2642780169211018836e+03)),
@@ -118,12 +119,13 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 9.0593769594993125859e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4048255576957727686e+00)),
x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5200781102863106496e+00)),
x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.160e+02)),
x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.42444230422723137837e-03)),
x21 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4130e+03)),
x22 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.46860286310649596604e-04));
BOOST_MATH_STATIC const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4048255576957727686e+00));
BOOST_MATH_STATIC const T x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5200781102863106496e+00));
BOOST_MATH_STATIC const T x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.160e+02));
BOOST_MATH_STATIC const T x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.42444230422723137837e-03));
BOOST_MATH_STATIC const T x21 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4130e+03));
BOOST_MATH_STATIC const T x22 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.46860286310649596604e-04));
T value, factor, r, rc, rs;
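
These tables are consumed by Boost.Math's rational-evaluation helper; a simplified sketch of the pattern (the two-entry coefficients below are made up, and the real j0 also folds in the root-shift constants x1, x11, x12 listed above):

#include <boost/math/tools/rational.hpp>

// Sketch only: evaluate P(x^2) / Q(x^2) from coefficient tables stored
// lowest-order-first, the same layout used by the tables above.
template <class T>
T rational_approx(T x)
{
   static const T P[] = { T(1), T(-0.25) };   // placeholder coefficients
   static const T Q[] = { T(1), T(0.0625) };
   return boost::math::tools::evaluate_rational(P, Q, x * x);
}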

View File

@@ -10,6 +10,7 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
@@ -32,27 +33,29 @@
namespace boost { namespace math{ namespace detail{
template <typename T>
T bessel_j1(T x);
BOOST_MATH_GPU_ENABLED T bessel_j1(T x);
template <class T>
struct bessel_j1_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init();
}
static void do_init()
BOOST_MATH_GPU_ENABLED static void do_init()
{
bessel_j1(T(1));
}
void force_instantiate()const{}
BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_STATIC const init initializer;
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
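
This is the library's usual instantiation-forcing idiom: a static member whose constructor primes the coefficient tables at program start-up, ODR-used through a no-op call. On GPU builds dynamic initializers are unavailable, hence the new #ifndef. A stripped-down sketch with hypothetical names:

// warm_up() stands in for the bessel_j1(T(1)) call that primes the tables.
template <class T>
void warm_up() { /* evaluate the function once */ }

template <class T>
struct table_initializer
{
   struct init
   {
      init() { warm_up<T>(); }            // runs during dynamic initialization
      void force_instantiate() const {}   // no-op; exists only to be ODR-used
   };
   static const init initializer;
   static void force_instantiate()
   {
#ifndef BOOST_MATH_HAS_GPU_SUPPORT        // no dynamic initializers on device
      initializer.force_instantiate();
#endif
   }
};
template <class T>
const typename table_initializer<T>::init table_initializer<T>::initializer;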
@@ -60,11 +63,11 @@ template <class T>
const typename bessel_j1_initializer<T>::init bessel_j1_initializer<T>::initializer;
template <typename T>
T bessel_j1(T x)
BOOST_MATH_GPU_ENABLED T bessel_j1(T x)
{
bessel_j1_initializer<T>::force_instantiate();
static const T P1[] = {
BOOST_MATH_STATIC const T P1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4258509801366645672e+11)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6781041261492395835e+09)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1548696764841276794e+08)),
@@ -73,7 +76,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0650724020080236441e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.0767857011487300348e-02))
};
static const T Q1[] = {
BOOST_MATH_STATIC const T Q1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1868604460820175290e+12)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.2091902282580133541e+10)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.0228375140097033958e+08)),
@@ -82,7 +85,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0))
};
static const T P2[] = {
BOOST_MATH_STATIC const T P2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.7527881995806511112e+16)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.6608531731299018674e+15)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.6658018905416665164e+13)),
@@ -92,7 +95,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -7.5023342220781607561e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.6179191852758252278e+00))
};
static const T Q2[] = {
BOOST_MATH_STATIC const T Q2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7253905888447681194e+18)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7128800897135812012e+16)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.4899346165481429307e+13)),
@@ -102,7 +105,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.3886978985861357615e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T PC[] = {
BOOST_MATH_STATIC const T PC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278571e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9422465050776411957e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.6033732483649391093e+06)),
@@ -111,7 +114,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.6116166443246101165e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0))
};
static const T QC[] = {
BOOST_MATH_STATIC const T QC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278568e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9341243899345856590e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.5853394797230870728e+06)),
@@ -120,7 +123,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4550094401904961825e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T PS[] = {
BOOST_MATH_STATIC const T PS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.3220913409857223519e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.5145160675335701966e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6178836581270835179e+04)),
@@ -129,7 +132,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.5265133846636032186e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0))
};
static const T QS[] = {
BOOST_MATH_STATIC const T QS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0871281941028743574e+05)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8194580422439972989e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4194606696037208929e+06)),
@@ -138,12 +141,13 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.6383677696049909675e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.8317059702075123156e+00)),
x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0155866698156187535e+00)),
x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 9.810e+02)),
x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.2527979248768438556e-04)),
x21 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7960e+03)),
x22 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.8330184381246462950e-05));
BOOST_MATH_STATIC const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.8317059702075123156e+00));
BOOST_MATH_STATIC const T x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0155866698156187535e+00));
BOOST_MATH_STATIC const T x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 9.810e+02));
BOOST_MATH_STATIC const T x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.2527979248768438556e-04));
BOOST_MATH_STATIC const T x21 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7960e+03));
BOOST_MATH_STATIC const T x22 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.8330184381246462950e-05));
T value, factor, r, rc, rs, w;

View File

@@ -10,6 +10,10 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/gamma.hpp>
#include <boost/math/special_functions/detail/bessel_j0.hpp>
#include <boost/math/special_functions/detail/bessel_j1.hpp>
#include <boost/math/special_functions/detail/bessel_jy.hpp>
@@ -24,7 +28,7 @@
namespace boost { namespace math { namespace detail{
template <typename T, typename Policy>
T bessel_jn(int n, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED T bessel_jn(int n, T x, const Policy& pol)
{
T value(0), factor, current, prev, next;
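
For reference, integer orders are built from the three-term recurrence shared by all cylinder functions,

$$
\mathcal{C}_{n+1}(x) = \frac{2n}{x}\,\mathcal{C}_n(x) - \mathcal{C}_{n-1}(x),
$$

run upward from J0 and J1 where that is stable (roughly n < x) and via backward, Miller-style recurrence otherwise, since forward recurrence on J loses accuracy once the order exceeds the argument.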

View File

@@ -11,16 +11,18 @@
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/special_functions/gamma.hpp>
#include <boost/math/special_functions/sign.hpp>
#include <boost/math/special_functions/hypot.hpp>
#include <boost/math/special_functions/sin_pi.hpp>
#include <boost/math/special_functions/cos_pi.hpp>
#include <boost/math/special_functions/round.hpp>
#include <boost/math/special_functions/detail/bessel_jy_asym.hpp>
#include <boost/math/special_functions/detail/bessel_jy_series.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <complex>
// Bessel functions of the first and second kind of fractional order
@@ -38,7 +40,7 @@ namespace boost { namespace math {
// try it and see...
//
template <class T, class Policy>
bool hankel_PQ(T v, T x, T* p, T* q, const Policy& )
BOOST_MATH_GPU_ENABLED bool hankel_PQ(T v, T x, T* p, T* q, const Policy& )
{
BOOST_MATH_STD_USING
T tolerance = 2 * policies::get_epsilon<T, Policy>();
@@ -70,7 +72,7 @@ namespace boost { namespace math {
// Calculate Y(v, x) and Y(v+1, x) by Temme's method, see
// Temme, Journal of Computational Physics, vol 21, 343 (1976)
template <typename T, typename Policy>
int temme_jy(T v, T x, T* Y, T* Y1, const Policy& pol)
BOOST_MATH_GPU_ENABLED int temme_jy(T v, T x, T* Y, T* Y1, const Policy& pol)
{
T g, h, p, q, f, coef, sum, sum1, tolerance;
T a, d, e, sigma;
@@ -139,7 +141,7 @@ namespace boost { namespace math {
// Evaluate continued fraction fv = J_(v+1) / J_v, see
// Abramowitz and Stegun, Handbook of Mathematical Functions, 1972, 9.1.73
template <typename T, typename Policy>
int CF1_jy(T v, T x, T* fv, int* sign, const Policy& pol)
BOOST_MATH_GPU_ENABLED int CF1_jy(T v, T x, T* fv, int* sign, const Policy& pol)
{
T C, D, f, a, b, delta, tiny, tolerance;
unsigned long k;
@@ -185,7 +187,7 @@ namespace boost { namespace math {
// real values only.
//
template <typename T, typename Policy>
int CF2_jy(T v, T x, T* p, T* q, const Policy& pol)
BOOST_MATH_GPU_ENABLED int CF2_jy(T v, T x, T* p, T* q, const Policy& pol)
{
BOOST_MATH_STD_USING
@@ -254,13 +256,13 @@ namespace boost { namespace math {
return 0;
}
static const int need_j = 1;
static const int need_y = 2;
BOOST_MATH_STATIC const int need_j = 1;
BOOST_MATH_STATIC const int need_y = 2;
// Compute J(v, x) and Y(v, x) simultaneously by Steed's method, see
// Barnett et al, Computer Physics Communications, vol 8, 377 (1974)
template <typename T, typename Policy>
int bessel_jy(T v, T x, T* J, T* Y, int kind, const Policy& pol)
BOOST_MATH_GPU_ENABLED int bessel_jy(T v, T x, T* J, T* Y, int kind, const Policy& pol)
{
BOOST_MATH_ASSERT(x >= 0);
@@ -273,7 +275,7 @@ namespace boost { namespace math {
T cp = 0;
T sp = 0;
static const char* function = "boost::math::bessel_jy<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::bessel_jy<%1%>(%1%,%1%)";
BOOST_MATH_STD_USING
using namespace boost::math::tools;
@@ -284,7 +286,7 @@ namespace boost { namespace math {
reflect = true;
v = -v; // v is non-negative from here
}
if (v > static_cast<T>((std::numeric_limits<int>::max)()))
if (v > static_cast<T>((boost::math::numeric_limits<int>::max)()))
{
*J = *Y = policies::raise_evaluation_error<T>(function, "Order of Bessel function is too large to evaluate: got %1%", v, pol);
return 1; // LCOV_EXCL_LINE previous line will throw.
@@ -310,10 +312,10 @@ namespace boost { namespace math {
else if(kind & need_j)
*J = policies::raise_domain_error<T>(function, "Value of Bessel J_v(x) is complex-infinity at %1%", x, pol); // complex infinity
else
*J = std::numeric_limits<T>::quiet_NaN(); // LCOV_EXCL_LINE, we should never get here, any value will do, not using J.
*J = boost::math::numeric_limits<T>::quiet_NaN(); // LCOV_EXCL_LINE, we should never get here, any value will do, not using J.
if((kind & need_y) == 0)
*Y = std::numeric_limits<T>::quiet_NaN(); // any value will do, not using Y.
*Y = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do, not using Y.
else
{
// We should never get here:
@@ -333,7 +335,7 @@ namespace boost { namespace math {
// and divergent which leads to large errors :-(
//
Jv = bessel_j_small_z_series(v, x, pol);
Yv = std::numeric_limits<T>::quiet_NaN();
Yv = boost::math::numeric_limits<T>::quiet_NaN();
}
else if((x < 1) && (u != 0) && (log(policies::get_epsilon<T, Policy>() / 2) > v * log((x/2) * (x/2) / v)))
{
@@ -344,7 +346,7 @@ namespace boost { namespace math {
if(kind&need_j)
Jv = bessel_j_small_z_series(v, x, pol);
else
Jv = std::numeric_limits<T>::quiet_NaN();
Jv = boost::math::numeric_limits<T>::quiet_NaN();
if((org_kind&need_y && (!reflect || (cp != 0)))
|| (org_kind & need_j && (reflect && (sp != 0))))
{
@@ -352,7 +354,7 @@ namespace boost { namespace math {
Yv = bessel_y_small_z_series(v, x, &Yv_scale, pol);
}
else
Yv = std::numeric_limits<T>::quiet_NaN();
Yv = boost::math::numeric_limits<T>::quiet_NaN();
}
else if((u == 0) && (x < policies::get_epsilon<T, Policy>()))
{
@@ -363,7 +365,7 @@ namespace boost { namespace math {
if(kind&need_j)
Jv = bessel_j_small_z_series(v, x, pol);
else
Jv = std::numeric_limits<T>::quiet_NaN();
Jv = boost::math::numeric_limits<T>::quiet_NaN();
if((org_kind&need_y && (!reflect || (cp != 0)))
|| (org_kind & need_j && (reflect && (sp != 0))))
{
@@ -371,7 +373,7 @@ namespace boost { namespace math {
Yv = bessel_yn_small_z(n, x, &Yv_scale, pol);
}
else
Yv = std::numeric_limits<T>::quiet_NaN();
Yv = boost::math::numeric_limits<T>::quiet_NaN();
// LCOV_EXCL_STOP
}
else if(asymptotic_bessel_large_x_limit(v, x))
@@ -381,13 +383,13 @@ namespace boost { namespace math {
Yv = asymptotic_bessel_y_large_x_2(v, x, pol);
}
else
Yv = std::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
Yv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
if(kind&need_j)
{
Jv = asymptotic_bessel_j_large_x_2(v, x, pol);
}
else
Jv = std::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
Jv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
}
else if((x > 8) && hankel_PQ(v, x, &p, &q, pol))
{
@@ -449,7 +451,7 @@ namespace boost { namespace math {
Jv = scale * W / (Yv * fv - Yv1); // Wronskian relation
}
else
Jv = std::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
Jv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
Yv_scale = scale;
}
else // x in (2, \infty)
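
The "Wronskian relation" used above recovers Jv from Yv and the CF1 ratio fv = J_{v+1}/J_v via the standard identity:

$$
J_{\nu+1}(x)\,Y_\nu(x) - J_\nu(x)\,Y_{\nu+1}(x) = \frac{2}{\pi x}
\;\Longrightarrow\;
J_\nu(x) = \frac{2/(\pi x)}{f_\nu\,Y_\nu(x) - Y_{\nu+1}(x)}.
$$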
@@ -564,7 +566,7 @@ namespace boost { namespace math {
Yv = prev;
}
else
Yv = std::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
Yv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
}
if (reflect)

View File

@@ -16,12 +16,15 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/special_functions/factorials.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
namespace boost{ namespace math{ namespace detail{
template <class T>
inline T asymptotic_bessel_amplitude(T v, T x)
BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_amplitude(T v, T x)
{
// Calculate the amplitude of J(v, x) and Y(v, x) for large
// x: see A&S 9.2.28.
@@ -39,7 +42,7 @@ inline T asymptotic_bessel_amplitude(T v, T x)
}
template <class T>
T asymptotic_bessel_phase_mx(T v, T x)
BOOST_MATH_GPU_ENABLED T asymptotic_bessel_phase_mx(T v, T x)
{
//
// Calculate the phase of J(v, x) and Y(v, x) for large x.
@@ -63,7 +66,7 @@ T asymptotic_bessel_phase_mx(T v, T x)
}
template <class T, class Policy>
inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol)
{
// See A&S 9.2.19.
BOOST_MATH_STD_USING
@@ -93,7 +96,7 @@ inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol)
}
template <class T, class Policy>
inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol)
{
// See A&S 9.2.19.
BOOST_MATH_STD_USING
@@ -124,7 +127,7 @@ inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol)
}
template <class T>
inline bool asymptotic_bessel_large_x_limit(int v, const T& x)
BOOST_MATH_GPU_ENABLED inline bool asymptotic_bessel_large_x_limit(int v, const T& x)
{
BOOST_MATH_STD_USING
//
@@ -142,7 +145,7 @@ inline bool asymptotic_bessel_large_x_limit(int v, const T& x)
}
template <class T>
inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x)
BOOST_MATH_GPU_ENABLED inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x)
{
BOOST_MATH_STD_USING
//
@@ -155,11 +158,11 @@ inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x)
// error rates either side of the divide for v < 10000.
// At double precision eps^1/8 ~= 0.01.
//
return (std::max)(T(fabs(v)), T(1)) < x * sqrt(tools::forth_root_epsilon<T>());
return BOOST_MATH_GPU_SAFE_MAX(T(fabs(v)), T(1)) < x * sqrt(tools::forth_root_epsilon<T>());
}
template <class T, class Policy>
void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol)
BOOST_MATH_GPU_ENABLED void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol)
{
T c = 1;
T p = (v / boost::math::sin_pi(v, pol)) * pow(x / 2, -v) / boost::math::tgamma(1 - v, pol);
@@ -193,7 +196,7 @@ void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol)
}
template <class T, class Policy>
T asymptotic_bessel_i_large_x(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED T asymptotic_bessel_i_large_x(T v, T x, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
T s = 1;
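
The sum started here should be the standard large-argument expansion for I (A&S 9.7.1), stated from the literature rather than from this hunk:

$$
I_\nu(x) \sim \frac{e^{x}}{\sqrt{2\pi x}} \left[ 1 - \frac{\mu-1}{8x} + \frac{(\mu-1)(\mu-9)}{2!\,(8x)^2} - \cdots \right],
\qquad \mu = 4\nu^2.
$$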

View File

@@ -10,10 +10,9 @@
#pragma once
#endif
#include <cmath>
#include <cstdint>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/tools/cstdint.hpp>
namespace boost { namespace math { namespace detail{
@@ -22,7 +21,7 @@ struct bessel_j_small_z_series_term
{
typedef T result_type;
bessel_j_small_z_series_term(T v_, T x)
BOOST_MATH_GPU_ENABLED bessel_j_small_z_series_term(T v_, T x)
: N(0), v(v_)
{
BOOST_MATH_STD_USING
@@ -30,7 +29,7 @@ struct bessel_j_small_z_series_term
mult *= -mult;
term = 1;
}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
T r = term;
++N;
@@ -49,7 +48,7 @@ private:
// Converges rapidly for all z << v.
//
template <class T, class Policy>
inline T bessel_j_small_z_series(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T bessel_j_small_z_series(T v, T x, const Policy& pol)
{
BOOST_MATH_STD_USING
T prefix;
@@ -66,7 +65,7 @@ inline T bessel_j_small_z_series(T v, T x, const Policy& pol)
return prefix;
bessel_j_small_z_series_term<T, Policy> s(v, x);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon<T, Policy>(), max_iter);
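
The series summed here is the ascending series for J, with the (z/2)^v / Gamma(v+1) prefix split out above to dodge premature overflow:

$$
J_\nu(z) = \left(\frac{z}{2}\right)^{\!\nu} \sum_{k=0}^{\infty} \frac{(-z^2/4)^k}{k!\;\Gamma(\nu + k + 1)}.
$$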
@@ -79,7 +78,7 @@ struct bessel_y_small_z_series_term_a
{
typedef T result_type;
bessel_y_small_z_series_term_a(T v_, T x)
BOOST_MATH_GPU_ENABLED bessel_y_small_z_series_term_a(T v_, T x)
: N(0), v(v_)
{
BOOST_MATH_STD_USING
@@ -87,7 +86,7 @@ struct bessel_y_small_z_series_term_a
mult *= -mult;
term = 1;
}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
BOOST_MATH_STD_USING
T r = term;
@@ -107,7 +106,7 @@ struct bessel_y_small_z_series_term_b
{
typedef T result_type;
bessel_y_small_z_series_term_b(T v_, T x)
BOOST_MATH_GPU_ENABLED bessel_y_small_z_series_term_b(T v_, T x)
: N(0), v(v_)
{
BOOST_MATH_STD_USING
@@ -115,7 +114,7 @@ struct bessel_y_small_z_series_term_b
mult *= -mult;
term = 1;
}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
T r = term;
++N;
@@ -138,10 +137,10 @@ private:
// eps/2 * v^v * (x/2)^-v > (x/2)^v, i.e. log(eps/2) > v * log((x/2)^2/v)
//
template <class T, class Policy>
inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol)
{
BOOST_MATH_STD_USING
static const char* function = "bessel_y_small_z_series<%1%>(%1%,%1%)";
constexpr auto function = "bessel_y_small_z_series<%1%>(%1%,%1%)";
T prefix;
T gam;
T p = log(x / 2);
@@ -183,7 +182,7 @@ inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol)
prefix = -exp(prefix);
}
bessel_y_small_z_series_term_a<T, Policy> s(v, x);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
*pscale = scale;
T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon<T, Policy>(), max_iter);
@@ -211,7 +210,7 @@ inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol)
}
template <class T, class Policy>
T bessel_yn_small_z(int n, T z, T* scale, const Policy& pol)
BOOST_MATH_GPU_ENABLED T bessel_yn_small_z(int n, T z, T* scale, const Policy& pol)
{
//
// See http://functions.wolfram.com/Bessel-TypeFunctions/BesselY/06/01/04/01/02/

View File

@@ -18,19 +18,30 @@
#ifndef BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_
#define BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_
#include <algorithm>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/roots.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/cbrt.hpp>
#include <boost/math/special_functions/detail/airy_ai_bi_zero.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/special_functions/math_fwd.hpp>
#endif
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_suppress 20012
#endif
namespace boost { namespace math {
namespace detail
{
namespace bessel_zero
{
template<class T>
T equation_nist_10_21_19(const T& v, const T& a)
BOOST_MATH_GPU_ENABLED T equation_nist_10_21_19(const T& v, const T& a)
{
// Get the initial estimate of the m'th root of Jv or Yv.
// This subroutine is used for the order m with m > 1.
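
NIST 10.21.19 is McMahon's expansion; its leading terms (quoted from the DLMF from memory, so treat the higher coefficients as indicative):

$$
j_{\nu,m} \sim a - \frac{\mu-1}{8a} - \frac{4(\mu-1)(7\mu-31)}{3\,(8a)^3} - \cdots,
\qquad a = \Big(m + \frac{\nu}{2} - \frac{1}{4}\Big)\pi,\quad \mu = 4\nu^2,
$$

with a = (m + v/2 - 3/4)pi giving the corresponding estimate for the zeros of Yv.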
@@ -57,11 +68,11 @@
class equation_as_9_3_39_and_its_derivative
{
public:
explicit equation_as_9_3_39_and_its_derivative(const T& zt) : zeta(zt) { }
BOOST_MATH_GPU_ENABLED explicit equation_as_9_3_39_and_its_derivative(const T& zt) : zeta(zt) { }
equation_as_9_3_39_and_its_derivative(const equation_as_9_3_39_and_its_derivative&) = default;
BOOST_MATH_GPU_ENABLED equation_as_9_3_39_and_its_derivative(const equation_as_9_3_39_and_its_derivative&) = default;
boost::math::tuple<T, T> operator()(const T& z) const
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(const T& z) const
{
BOOST_MATH_STD_USING // ADL of std names, needed for acos, sqrt.
@@ -86,7 +97,7 @@
};
template<class T, class Policy>
static T equation_as_9_5_26(const T& v, const T& ai_bi_root, const Policy& pol)
BOOST_MATH_GPU_ENABLED T equation_as_9_5_26(const T& v, const T& ai_bi_root, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names, needed for log, sqrt.
@@ -132,9 +143,9 @@
// Select the maximum allowed iterations based on the number
// of decimal digits in the numeric type T, being at least 12.
const auto iterations_allowed = static_cast<std::uintmax_t>((std::max)(12, my_digits10 * 2));
const auto iterations_allowed = static_cast<boost::math::uintmax_t>(BOOST_MATH_GPU_SAFE_MAX(12, my_digits10 * 2));
std::uintmax_t iterations_used = iterations_allowed;
boost::math::uintmax_t iterations_used = iterations_allowed;
// Calculate the root of z as a function of zeta.
const T z = boost::math::tools::newton_raphson_iterate(
@@ -142,7 +153,7 @@
z_estimate,
range_zmin,
range_zmax,
(std::min)(boost::math::tools::digits<T>(), boost::math::tools::digits<float>()),
BOOST_MATH_GPU_SAFE_MIN(boost::math::tools::digits<T>(), boost::math::tools::digits<float>()),
iterations_used);
static_cast<void>(iterations_used);
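
newton_raphson_iterate takes a functor returning the pair (f, f'), a guess, a bracket, a target precision in bits, and an in-out iteration budget. A self-contained sketch of that calling pattern (the problem and names are illustrative, not from this patch):

#include <boost/math/tools/roots.hpp>
#include <cstdint>
#include <limits>
#include <utility>

// Solve x*x - a = 0 for x >= 0, mirroring how the zero-finding code above
// drives boost::math::tools::newton_raphson_iterate.
double newton_sqrt(double a)
{
   auto f = [a](double x) { return std::make_pair(x * x - a, 2 * x); };
   std::uintmax_t iterations = 24;  // budget; decremented in place by the call
   const int digits = std::numeric_limits<double>::digits / 2;  // Newton doubles digits per step
   return boost::math::tools::newton_raphson_iterate(f, a / 2 + 0.5, 0.0, a + 1, digits, iterations);
}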
@@ -168,7 +179,7 @@
namespace cyl_bessel_j_zero_detail
{
template<class T, class Policy>
T equation_nist_10_21_40_a(const T& v, const Policy& pol)
BOOST_MATH_GPU_ENABLED T equation_nist_10_21_40_a(const T& v, const Policy& pol)
{
const T v_pow_third(boost::math::cbrt(v, pol));
const T v_pow_minus_two_thirds(T(1) / (v_pow_third * v_pow_third));
@@ -185,13 +196,13 @@
class function_object_jv
{
public:
function_object_jv(const T& v,
BOOST_MATH_GPU_ENABLED function_object_jv(const T& v,
const Policy& pol) : my_v(v),
my_pol(pol) { }
function_object_jv(const function_object_jv&) = default;
BOOST_MATH_GPU_ENABLED function_object_jv(const function_object_jv&) = default;
T operator()(const T& x) const
BOOST_MATH_GPU_ENABLED T operator()(const T& x) const
{
return boost::math::cyl_bessel_j(my_v, x, my_pol);
}
@@ -206,15 +217,16 @@
class function_object_jv_and_jv_prime
{
public:
function_object_jv_and_jv_prime(const T& v,
const bool order_is_zero,
const Policy& pol) : my_v(v),
BOOST_MATH_GPU_ENABLED function_object_jv_and_jv_prime(
const T& v,
const bool order_is_zero,
const Policy& pol) : my_v(v),
my_order_is_zero(order_is_zero),
my_pol(pol) { }
function_object_jv_and_jv_prime(const function_object_jv_and_jv_prime&) = default;
boost::math::tuple<T, T> operator()(const T& x) const
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(const T& x) const
{
// Obtain Jv(x) and Jv'(x).
// Chris's original code called the Bessel function implementation layer directly,
@@ -246,10 +258,10 @@
const function_object_jv_and_jv_prime& operator=(const function_object_jv_and_jv_prime&) = delete;
};
template<class T> bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; }
template<class T> BOOST_MATH_GPU_ENABLED bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; }
template<class T, class Policy>
T initial_guess(const T& v, const int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED T initial_guess(const T& v, const int m, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names, needed for floor.
@@ -325,7 +337,7 @@
}
// Perform several steps of bisection iteration to refine the guess.
std::uintmax_t number_of_iterations(12U);
boost::math::uintmax_t number_of_iterations(12U);
// Do the bisection iteration.
const boost::math::tuple<T, T> guess_pair =
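
Because the tolerance functor above always returns false, bisect runs for exactly the given budget, i.e. it is used here as "halve the bracket N times". A sketch of that trick under the same assumptions (names illustrative):

#include <boost/math/tools/roots.hpp>
#include <cstdint>
#include <utility>

// Refine a bracketed root with a fixed number of bisection steps by passing
// a tolerance that never reports convergence.
template <class F, class T>
T refine_by_bisection(F f, T lo, T hi)
{
   std::uintmax_t iters = 12;  // the bracket is halved exactly this many times
   auto never_done = [](const T&, const T&) { return false; };
   const std::pair<T, T> bracket = boost::math::tools::bisect(f, lo, hi, never_done, iters);
   return (bracket.first + bracket.second) / 2;
}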
@@ -390,7 +402,7 @@
namespace cyl_neumann_zero_detail
{
template<class T, class Policy>
T equation_nist_10_21_40_b(const T& v, const Policy& pol)
BOOST_MATH_GPU_ENABLED T equation_nist_10_21_40_b(const T& v, const Policy& pol)
{
const T v_pow_third(boost::math::cbrt(v, pol));
const T v_pow_minus_two_thirds(T(1) / (v_pow_third * v_pow_third));
@@ -407,13 +419,13 @@
class function_object_yv
{
public:
function_object_yv(const T& v,
const Policy& pol) : my_v(v),
my_pol(pol) { }
BOOST_MATH_GPU_ENABLED function_object_yv(const T& v,
const Policy& pol) : my_v(v),
my_pol(pol) { }
function_object_yv(const function_object_yv&) = default;
BOOST_MATH_GPU_ENABLED function_object_yv(const function_object_yv&) = default;
T operator()(const T& x) const
BOOST_MATH_GPU_ENABLED T operator()(const T& x) const
{
return boost::math::cyl_neumann(my_v, x, my_pol);
}
@@ -428,13 +440,13 @@
class function_object_yv_and_yv_prime
{
public:
function_object_yv_and_yv_prime(const T& v,
const Policy& pol) : my_v(v),
my_pol(pol) { }
BOOST_MATH_GPU_ENABLED function_object_yv_and_yv_prime(const T& v,
const Policy& pol) : my_v(v),
my_pol(pol) { }
function_object_yv_and_yv_prime(const function_object_yv_and_yv_prime&) = default;
BOOST_MATH_GPU_ENABLED function_object_yv_and_yv_prime(const function_object_yv_and_yv_prime&) = default;
boost::math::tuple<T, T> operator()(const T& x) const
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(const T& x) const
{
const T half_epsilon(boost::math::tools::epsilon<T>() / 2U);
@@ -469,10 +481,10 @@
const function_object_yv_and_yv_prime& operator=(const function_object_yv_and_yv_prime&) = delete;
};
template<class T> bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; }
template<class T> BOOST_MATH_GPU_ENABLED bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; }
template<class T, class Policy>
T initial_guess(const T& v, const int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED T initial_guess(const T& v, const int m, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names, needed for floor.
@@ -560,7 +572,7 @@
}
// Perform several steps of bisection iteration to refine the guess.
std::uintmax_t number_of_iterations(12U);
boost::math::uintmax_t number_of_iterations(12U);
// Do the bisection iteration.
const boost::math::tuple<T, T> guess_pair =
@@ -624,4 +636,8 @@
} // namespace bessel_zero
} } } // namespace boost::math::detail
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_default 20012
#endif
#endif // BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_

View File

@@ -13,10 +13,14 @@
#pragma warning(disable:4702) // Unreachable code (release mode only warning)
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/policies/error_handling.hpp>
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -44,35 +48,37 @@
namespace boost { namespace math { namespace detail{
template <typename T>
T bessel_k0(const T& x);
BOOST_MATH_GPU_ENABLED T bessel_k0(const T& x);
template <class T, class tag>
struct bessel_k0_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init(tag());
}
static void do_init(const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
{
bessel_k0(T(0.5));
bessel_k0(T(1.5));
}
static void do_init(const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
{
bessel_k0(T(0.5));
bessel_k0(T(1.5));
}
template <class U>
static void do_init(const U&){}
void force_instantiate()const{}
BOOST_MATH_GPU_ENABLED static void do_init(const U&){}
BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_STATIC const init initializer;
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
@@ -81,14 +87,14 @@ const typename bessel_k0_initializer<T, tag>::init bessel_k0_initializer<T, tag>
template <typename T, int N>
T bessel_k0_imp(const T&, const std::integral_constant<int, N>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T&, const boost::math::integral_constant<int, N>&)
{
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
T bessel_k0_imp(const T& x, const std::integral_constant<int, 24>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant<int, 24>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -97,14 +103,14 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 24>&)
// Expected Error Term : -2.358e-09
// Maximum Relative Change in Control Points : 9.552e-02
// Max Error found at float precision = Poly : 4.448220e-08
static const T Y = 1.137250900268554688f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.137250900268554688f;
BOOST_MATH_STATIC const T P[] =
{
-1.372508979104259711e-01f,
2.622545986273687617e-01f,
5.047103728247919836e-03f
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000000000000e+00f,
-8.928694018000029415e-02f,
@@ -117,7 +123,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 24>&)
// Expected Error Term : -1.343e-09
// Maximum Relative Change in Control Points : 2.405e-02
// Max Error found at float precision = Poly : 1.354814e-07
static const T P2[] = {
BOOST_MATH_STATIC const T P2[] = {
1.159315158e-01f,
2.789828686e-01f,
2.524902861e-02f,
@@ -133,14 +139,14 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 24>&)
// Maximum Relative Change in Control Points : 9.064e-02
// Max Error found at float precision = Poly : 5.065020e-08
static const T P[] =
BOOST_MATH_STATIC const T P[] =
{
2.533141220e-01f,
5.221502603e-01f,
6.380180669e-02f,
-5.934976547e-02f
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000e+00f,
2.679722431e+00f,
@@ -158,7 +164,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 24>&)
}
template <typename T>
T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant<int, 53>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -167,8 +173,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
// Expected Error Term : -6.077e-17
// Maximum Relative Change in Control Points : 7.797e-02
// Max Error found at double precision = Poly : 1.003156e-16
static const T Y = 1.137250900268554688;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.137250900268554688;
BOOST_MATH_STATIC const T P[] =
{
-1.372509002685546267e-01,
2.574916117833312855e-01,
@@ -176,7 +182,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
5.445476986653926759e-04,
7.125159422136622118e-06
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000000000000e+00,
-5.458333438017788530e-02,
@@ -191,7 +197,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
// Expected Error Term : 3.392e-18
// Maximum Relative Change in Control Points : 2.041e-02
// Max Error found at double precision = Poly : 2.513112e-16
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
1.159315156584124484e-01,
2.789828789146031732e-01,
@@ -212,8 +218,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
// Maximum Relative Change in Control Points : 2.757e-01
// Max Error found at double precision = Poly : 1.001560e-16
static const T Y = 1;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1;
BOOST_MATH_STATIC const T P[] =
{
2.533141373155002416e-01,
3.628342133984595192e+00,
@@ -225,7 +231,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
-1.414237994269995877e+00,
-9.369168119754924625e-02
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000000000000e+00,
1.494194694879908328e+01,
@@ -248,7 +254,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
}
template <typename T>
T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant<int, 64>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -257,8 +263,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
// Expected Error Term : 2.180e-22
// Maximum Relative Change in Control Points : 2.943e-01
// Max Error found at float80 precision = Poly : 3.923207e-20
static const T Y = 1.137250900268554687500e+00;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.137250900268554687500e+00;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, -1.372509002685546875002e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.566481981037407600436e-01),
@@ -267,7 +273,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_BIG_CONSTANT(T, 64, 1.213747930378196492543e-05),
BOOST_MATH_BIG_CONSTANT(T, 64, 9.423709328020389560844e-08)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.843828412587773008342e-02),
@@ -284,7 +290,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
// Expected Error Term : -2.434e-21
// Maximum Relative Change in Control Points : 2.459e-02
// Max Error found at float80 precision = Poly : 1.482487e-19
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.159315156584124488110e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.764832791416047889734e-01),
@@ -292,7 +298,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_BIG_CONSTANT(T, 64, 3.660777862036966089410e-04),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.094942446930673386849e-06)
};
static const T Q2[] =
BOOST_MATH_STATIC const T Q2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, -2.156100313881251616320e-02),
@@ -308,8 +314,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
// Expected Error Term : 2.236e-21
// Maximum Relative Change in Control Points : 3.021e-01
//Max Error found at float80 precision = Poly : 8.727378e-20
static const T Y = 1;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 2.533141373155002512056e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 5.417942070721928652715e+00),
@@ -323,7 +329,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_BIG_CONSTANT(T, 64, -4.059789241612946683713e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.612783121537333908889e-01)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.200669254769325861404e+01),
@@ -348,7 +354,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
}
template <typename T>
T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant<int, 113>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -357,8 +363,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
// Expected Error Term : 5.682e-37
// Maximum Relative Change in Control Points : 6.094e-04
// Max Error found at float128 precision = Poly : 5.338213e-35
static const T Y = 1.137250900268554687500000000000000000e+00f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.137250900268554687500000000000000000e+00f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, -1.372509002685546875000000000000000006e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.556212905071072782462974351698081303e-01),
@@ -369,7 +375,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_BIG_CONSTANT(T, 113, 1.752489221949580551692915881999762125e-09),
BOOST_MATH_BIG_CONSTANT(T, 113, 5.243010555737173524710512824955368526e-12)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, -4.095631064064621099785696980653193721e-02),
@@ -387,7 +393,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
// Expected Error Term : 5.105e-38
// Maximum Relative Change in Control Points : 9.734e-03
// Max Error found at float128 precision = Poly : 1.688806e-34
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.159315156584124488107200313757741370e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.789828789146031122026800078439435369e-01),
@@ -413,8 +419,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
// Expected Error Term : 4.917e-40
// Maximum Relative Change in Control Points : 3.385e-01
// Max Error found at float128 precision = Poly : 1.567573e-34
static const T Y = 1;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 2.533141373155002512078826424055226265e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.001949740768235770078339977110749204e+01),
@@ -439,7 +445,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_BIG_CONSTANT(T, 113, -4.201632288615609937883545928660649813e+03),
BOOST_MATH_BIG_CONSTANT(T, 113, -3.690820607338480548346746717311811406e+01)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, 7.964877874035741452203497983642653107e+01),
@@ -475,33 +481,33 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
}
template <typename T>
T bessel_k0_imp(const T& x, const std::integral_constant<int, 0>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant<int, 0>&)
{
if(boost::math::tools::digits<T>() <= 24)
return bessel_k0_imp(x, std::integral_constant<int, 24>());
return bessel_k0_imp(x, boost::math::integral_constant<int, 24>());
else if(boost::math::tools::digits<T>() <= 53)
return bessel_k0_imp(x, std::integral_constant<int, 53>());
return bessel_k0_imp(x, boost::math::integral_constant<int, 53>());
else if(boost::math::tools::digits<T>() <= 64)
return bessel_k0_imp(x, std::integral_constant<int, 64>());
return bessel_k0_imp(x, boost::math::integral_constant<int, 64>());
else if(boost::math::tools::digits<T>() <= 113)
return bessel_k0_imp(x, std::integral_constant<int, 113>());
return bessel_k0_imp(x, boost::math::integral_constant<int, 113>());
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
inline T bessel_k0(const T& x)
BOOST_MATH_GPU_ENABLED inline T bessel_k0(const T& x)
{
typedef std::integral_constant<int,
((std::numeric_limits<T>::digits == 0) || (std::numeric_limits<T>::radix != 2)) ?
typedef boost::math::integral_constant<int,
((boost::math::numeric_limits<T>::digits == 0) || (boost::math::numeric_limits<T>::radix != 2)) ?
0 :
std::numeric_limits<T>::digits <= 24 ?
boost::math::numeric_limits<T>::digits <= 24 ?
24 :
std::numeric_limits<T>::digits <= 53 ?
boost::math::numeric_limits<T>::digits <= 53 ?
53 :
std::numeric_limits<T>::digits <= 64 ?
boost::math::numeric_limits<T>::digits <= 64 ?
64 :
std::numeric_limits<T>::digits <= 113 ?
boost::math::numeric_limits<T>::digits <= 113 ?
113 : -1
> tag_type;
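
The tag computed here routes overload resolution to the coefficient table sized for T's mantissa; a host-side mirror of that selection logic (hypothetical helper, using std::integral_constant for brevity):

#include <type_traits>

// Map a mantissa width in bits to the precision tag the _imp overloads key on.
template <int Digits>
using k0_tag_for = std::integral_constant<int,
    Digits <= 24 ? 24 :
    Digits <= 53 ? 53 :
    Digits <= 64 ? 64 :
    Digits <= 113 ? 113 : -1>;

static_assert(k0_tag_for<24>::value == 24, "float selects the 24-bit table");
static_assert(k0_tag_for<53>::value == 53, "double selects the 53-bit table");
static_assert(k0_tag_for<64>::value == 64, "80-bit long double selects the 64-bit table");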

View File

@@ -13,6 +13,10 @@
#pragma warning(disable:4702) // Unreachable code (release mode only warning)
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
#include <boost/math/policies/error_handling.hpp>
@@ -44,36 +48,38 @@
namespace boost { namespace math { namespace detail{
template <typename T>
T bessel_k1(const T&);
BOOST_MATH_GPU_ENABLED T bessel_k1(const T&);
template <class T, class tag>
struct bessel_k1_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init(tag());
}
static void do_init(const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
{
bessel_k1(T(0.5));
bessel_k1(T(2));
bessel_k1(T(6));
}
static void do_init(const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
{
bessel_k1(T(0.5));
bessel_k1(T(6));
}
template <class U>
static void do_init(const U&) {}
void force_instantiate()const {}
BOOST_MATH_GPU_ENABLED static void do_init(const U&) {}
BOOST_MATH_GPU_ENABLED void force_instantiate()const {}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_STATIC const init initializer;
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
@@ -82,14 +88,14 @@ namespace boost { namespace math { namespace detail{
template <typename T, int N>
inline T bessel_k1_imp(const T&, const std::integral_constant<int, N>&)
inline BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T&, const boost::math::integral_constant<int, N>&)
{
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
T bessel_k1_imp(const T& x, const std::integral_constant<int, 24>&)
BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant<int, 24>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -98,14 +104,14 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -3.053e-12
// Maximum Relative Change in Control Points : 4.927e-02
// Max Error found at float precision = Poly : 7.918347e-10
static const T Y = 8.695471287e-02f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 8.695471287e-02f;
BOOST_MATH_STATIC const T P[] =
{
-3.621379531e-03f,
7.131781976e-03f,
-1.535278300e-05f
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000e+00f,
-5.173102701e-02f,
@@ -118,7 +124,7 @@ namespace boost { namespace math { namespace detail{
// Maximum Deviation Found: 3.556e-08
// Expected Error Term : -3.541e-08
// Maximum Relative Change in Control Points : 8.203e-02
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
-3.079657469e-01f,
-8.537108913e-02f,
@@ -134,15 +140,15 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -3.227e-08
// Maximum Relative Change in Control Points : 9.917e-02
// Max Error found at float precision = Poly : 6.084411e-08
static const T Y = 1.450342178f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.450342178f;
BOOST_MATH_STATIC const T P[] =
{
-1.970280088e-01f,
2.188747807e-02f,
7.270394756e-01f,
2.490678196e-01f
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000e+00f,
2.274292882e+00f,
@@ -160,7 +166,7 @@ namespace boost { namespace math { namespace detail{
}
template <typename T>
T bessel_k1_imp(const T& x, const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant<int, 53>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -169,15 +175,15 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : 1.921e-17
// Maximum Relative Change in Control Points : 5.287e-03
// Max Error found at double precision = Poly : 2.004747e-17
static const T Y = 8.69547128677368164e-02f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 8.69547128677368164e-02f;
BOOST_MATH_STATIC const T P[] =
{
-3.62137953440350228e-03,
7.11842087490330300e-03,
1.00302560256614306e-05,
1.77231085381040811e-06
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.00000000000000000e+00,
-4.80414794429043831e-02,
@@ -193,14 +199,14 @@ namespace boost { namespace math { namespace detail{
// Maximum Relative Change in Control Points : 3.103e-04
// Max Error found at double precision = Poly : 1.246698e-16
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
-3.07965757829206184e-01,
-7.80929703673074907e-02,
-2.70619343754051620e-03,
-2.49549522229072008e-05
};
static const T Q2[] =
BOOST_MATH_STATIC const T Q2[] =
{
1.00000000000000000e+00,
-2.36316836412163098e-02,
@@ -217,8 +223,8 @@ namespace boost { namespace math { namespace detail{
// Maximum Relative Change in Control Points : 2.786e-01
// Max Error found at double precision = Poly : 1.258798e-16
static const T Y = 1.45034217834472656f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.45034217834472656f;
BOOST_MATH_STATIC const T P[] =
{
-1.97028041029226295e-01,
-2.32408961548087617e+00,
@@ -230,7 +236,7 @@ namespace boost { namespace math { namespace detail{
6.62582288933739787e+00,
3.08851840645286691e-01
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.00000000000000000e+00,
1.41811409298826118e+01,
@@ -253,7 +259,7 @@ namespace boost { namespace math { namespace detail{
}
template <typename T>
T bessel_k1_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant<int, 64>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -262,8 +268,8 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -5.548e-23
// Maximum Relative Change in Control Points : 2.002e-03
// Max Error found at float80 precision = Poly : 9.352785e-22
static const T Y = 8.695471286773681640625e-02f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 8.695471286773681640625e-02f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, -3.621379534403483072861e-03),
BOOST_MATH_BIG_CONSTANT(T, 64, 7.102135866103952705932e-03),
@@ -271,7 +277,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 64, 2.537484002571894870830e-06),
BOOST_MATH_BIG_CONSTANT(T, 64, 6.603228256820000135990e-09)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.354457194045068370363e-02),
@@ -287,7 +293,7 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : 1.995e-23
// Maximum Relative Change in Control Points : 8.174e-04
// Max Error found at float80 precision = Poly : 4.137325e-20
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, -3.079657578292062244054e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -7.963049154965966503231e-02),
@@ -295,7 +301,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 64, -4.023052834702215699504e-05),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.719459155018493821839e-07)
};
static const T Q2[] =
BOOST_MATH_STATIC const T Q2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.863917670410152669768e-02),
@@ -312,8 +318,8 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -3.302e-21
// Maximum Relative Change in Control Points : 3.432e-01
// Max Error found at float80 precision = Poly : 1.083755e-19
static const T Y = 1.450342178344726562500e+00f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.450342178344726562500e+00f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, -1.970280410292263112917e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.058564803062959169322e+00),
@@ -328,7 +334,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 64, 4.319614662598089438939e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, 3.710715864316521856193e-02)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.298433045824439052398e+01),
@@ -353,7 +359,7 @@ namespace boost { namespace math { namespace detail{
}
template <typename T>
T bessel_k1_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant<int, 113>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -362,8 +368,8 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -7.119e-35
// Maximum Relative Change in Control Points : 1.207e-03
// Max Error found at float128 precision = Poly : 7.143688e-35
static const T Y = 8.695471286773681640625000000000000000e-02f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 8.695471286773681640625000000000000000e-02f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, -3.621379534403483072916666666666595475e-03),
BOOST_MATH_BIG_CONSTANT(T, 113, 7.074117676930975433219826471336547627e-03),
@@ -373,7 +379,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 113, 2.347140307321161346703214099534250263e-10),
BOOST_MATH_BIG_CONSTANT(T, 113, 5.569608494081482873946791086435679661e-13)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, -3.580768910152105375615558920428350204e-02),
@@ -391,7 +397,7 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : 4.473e-37
// Maximum Relative Change in Control Points : 8.550e-04
// Max Error found at float128 precision = Poly : 8.167701e-35
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, -3.079657578292062244053600156878870690e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -8.133183745732467770755578848987414875e-02),
@@ -401,7 +407,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 113, -1.632502325880313239698965376754406011e-09),
BOOST_MATH_BIG_CONSTANT(T, 113, -2.311973065898784812266544485665624227e-12)
};
static const T Q2[] =
BOOST_MATH_STATIC const T Q2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, -1.311471216733781016657962995723287450e-02),
@@ -418,8 +424,8 @@ namespace boost { namespace math { namespace detail{
{
// Max error in interpolated form: 5.307e-37
// Max Error found at float128 precision = Poly: 7.087862e-35
static const T Y = 1.5023040771484375f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.5023040771484375f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, -2.489899398329369710528254347931380044e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -6.819080211203854781858815596508456873e+00),
@@ -438,7 +444,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.039705646510167437971862966128055524e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, 1.008418100718254816100425022904039530e-02)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.927456835239137986889227412815459529e+01),
@@ -465,8 +471,8 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -6.565e-40
// Maximum Relative Change in Control Points : 1.880e-01
// Max Error found at float128 precision = Poly : 2.943572e-35
static const T Y = 1.308816909790039062500000000000000000f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.308816909790039062500000000000000000f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, -5.550277247453881129211735759447737350e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, -3.485883080219574328217554864956175929e+00),
@@ -486,7 +492,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 113, 8.981057433937398731355768088809437625e+05),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.519440069856232098711793483639792952e+04)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, 7.127348248283623146544565916604103560e+01),
@@ -517,33 +523,33 @@ namespace boost { namespace math { namespace detail{
}
template <typename T>
T bessel_k1_imp(const T& x, const std::integral_constant<int, 0>&)
BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant<int, 0>&)
{
if(boost::math::tools::digits<T>() <= 24)
return bessel_k1_imp(x, std::integral_constant<int, 24>());
return bessel_k1_imp(x, boost::math::integral_constant<int, 24>());
else if(boost::math::tools::digits<T>() <= 53)
return bessel_k1_imp(x, std::integral_constant<int, 53>());
return bessel_k1_imp(x, boost::math::integral_constant<int, 53>());
else if(boost::math::tools::digits<T>() <= 64)
return bessel_k1_imp(x, std::integral_constant<int, 64>());
return bessel_k1_imp(x, boost::math::integral_constant<int, 64>());
else if(boost::math::tools::digits<T>() <= 113)
return bessel_k1_imp(x, std::integral_constant<int, 113>());
return bessel_k1_imp(x, boost::math::integral_constant<int, 113>());
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
inline T bessel_k1(const T& x)
template <typename T>
inline BOOST_MATH_GPU_ENABLED T bessel_k1(const T& x)
{
typedef std::integral_constant<int,
((std::numeric_limits<T>::digits == 0) || (std::numeric_limits<T>::radix != 2)) ?
typedef boost::math::integral_constant<int,
((boost::math::numeric_limits<T>::digits == 0) || (boost::math::numeric_limits<T>::radix != 2)) ?
0 :
std::numeric_limits<T>::digits <= 24 ?
boost::math::numeric_limits<T>::digits <= 24 ?
24 :
std::numeric_limits<T>::digits <= 53 ?
boost::math::numeric_limits<T>::digits <= 53 ?
53 :
std::numeric_limits<T>::digits <= 64 ?
boost::math::numeric_limits<T>::digits <= 64 ?
64 :
std::numeric_limits<T>::digits <= 113 ?
boost::math::numeric_limits<T>::digits <= 113 ?
113 : -1
> tag_type;
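The tag_type selection above resolves entirely at compile time, so no runtime branch survives into device code. A minimal standalone sketch of the same digits-driven dispatch, with placeholder fits and std:: names (the commit swaps those for the boost::math shims so NVRTC can digest them):

#include <cstdio>
#include <limits>
#include <type_traits>

template <typename T>
T k1_imp(T x, const std::integral_constant<int, 24>&) { return x; }     // float-width fit (placeholder)

template <typename T>
T k1_imp(T x, const std::integral_constant<int, 53>&) { return 2 * x; } // double-width fit (placeholder)

template <typename T>
T k1_sketch(T x)
{
   using tag = std::integral_constant<int,
      std::numeric_limits<T>::digits <= 24 ? 24 : 53>;
   return k1_imp(x, tag());  // overload picked by the compiler, not at runtime
}

int main() { std::printf("%g %g\n", k1_sketch(1.0f), k1_sketch(1.0)); } // prints: 1 2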

View File

@@ -10,8 +10,12 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/detail/bessel_k0.hpp>
#include <boost/math/special_functions/detail/bessel_k1.hpp>
#include <boost/math/special_functions/sign.hpp>
#include <boost/math/policies/error_handling.hpp>
// Modified Bessel function of the second kind of integer order
@@ -20,14 +24,14 @@
namespace boost { namespace math { namespace detail{
template <typename T, typename Policy>
T bessel_kn(int n, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED T bessel_kn(int n, T x, const Policy& pol)
{
BOOST_MATH_STD_USING
T value, current, prev;
using namespace boost::math::tools;
static const char* function = "boost::math::bessel_kn<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::bessel_kn<%1%>(%1%,%1%)";
if (x < 0)
{
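bessel_kn itself is a forward recurrence over the two fixed-order kernels; K grows with order, so forward stepping is the numerically stable direction. A standalone sketch with the seeds passed in (plain double; Boost seeds from the bessel_k0/bessel_k1 fits above and adds overflow policing):

#include <cmath>
#include <cstdio>

double kn_sketch(int n, double x, double k0, double k1)
{
   double prev = k0, current = k1;
   for (int k = 1; k < n; ++k)
   {
      double next = 2 * k / x * current + prev;  // K_{k+1}(x) = (2k/x) K_k(x) + K_{k-1}(x)
      prev = current;
      current = next;
   }
   return n == 0 ? k0 : current;
}

int main()
{
   // K0(2) ~= 0.11389, K1(2) ~= 0.13987; expect K2(2) ~= 0.25376
   std::printf("%.5f\n", kn_sketch(2, 2.0, 0.11389, 0.13987));
}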

View File

@@ -12,6 +12,7 @@
#pragma warning(disable:4702) // Unreachable code (release mode only warning)
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/detail/bessel_j0.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/tools/rational.hpp>
@@ -36,12 +37,12 @@
namespace boost { namespace math { namespace detail{
template <typename T, typename Policy>
T bessel_y0(T x, const Policy&);
BOOST_MATH_GPU_ENABLED T bessel_y0(T x, const Policy&);
template <typename T, typename Policy>
T bessel_y0(T x, const Policy&)
BOOST_MATH_GPU_ENABLED T bessel_y0(T x, const Policy&)
{
static const T P1[] = {
BOOST_MATH_STATIC const T P1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0723538782003176831e+11)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.3716255451260504098e+09)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.0422274357376619816e+08)),
@@ -49,7 +50,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0102532948020907590e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8402381979244993524e+01)),
};
static const T Q1[] = {
BOOST_MATH_STATIC const T Q1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.8873865738997033405e+11)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.1617187777290363573e+09)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5662956624278251596e+07)),
@@ -57,7 +58,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6475986689240190091e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T P2[] = {
BOOST_MATH_STATIC const T P2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -2.2213976967566192242e+13)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -5.5107435206722644429e+11)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.3600098638603061642e+10)),
@@ -66,7 +67,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4566865832663635920e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7427031242901594547e+01)),
};
static const T Q2[] = {
BOOST_MATH_STATIC const T Q2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.3386146580707264428e+14)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4266824419412347550e+12)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.4015103849971240096e+10)),
@@ -75,7 +76,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.3030857612070288823e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T P3[] = {
BOOST_MATH_STATIC const T P3[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.0728726905150210443e+15)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.7016641869173237784e+14)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2829912364088687306e+11)),
@@ -85,7 +86,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1363534169313901632e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.7439661319197499338e+01)),
};
static const T Q3[] = {
BOOST_MATH_STATIC const T Q3[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.4563724628846457519e+17)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.9272425569640309819e+15)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2598377924042897629e+13)),
@@ -95,7 +96,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.7903362168128450017e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T PC[] = {
BOOST_MATH_STATIC const T PC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684302e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1345386639580765797e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1170523380864944322e+04)),
@@ -103,7 +104,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5376201909008354296e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.8961548424210455236e-01)),
};
static const T QC[] = {
BOOST_MATH_STATIC const T QC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684318e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1370412495510416640e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1215350561880115730e+04)),
@@ -111,7 +112,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5711159858080893649e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T PS[] = {
BOOST_MATH_STATIC const T PS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.9226600200800094098e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8591953644342993800e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1183429920482737611e+02)),
@@ -119,7 +120,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2441026745835638459e+00)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.8033303048680751817e-03)),
};
static const T QS[] = {
BOOST_MATH_STATIC const T QS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.7105024128512061905e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1951131543434613647e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.2642780169211018836e+03)),
@@ -127,7 +128,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 9.0593769594993125859e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.9357696627916752158e-01)),
BOOST_MATH_STATIC const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.9357696627916752158e-01)),
x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.9576784193148578684e+00)),
x3 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0860510603017726976e+00)),
x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.280e+02)),
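The P1/Q1 through PS/QS tables are consumed by tools::evaluate_rational as R(z) = P(z)/Q(z) over matched intervals of x. A standalone sketch of that evaluation in Horner form (fixed double precision, illustrative coefficients, not values from the tables):

#include <cstdio>
#include <cstddef>

template <std::size_t N>
double eval_rational(const double (&p)[N], const double (&q)[N], double z)
{
   double num = p[N - 1], den = q[N - 1];
   for (std::size_t i = N - 1; i-- > 0;)
   {
      num = num * z + p[i];   // Horner step; coefficients stored lowest degree first
      den = den * z + q[i];
   }
   return num / den;
}

int main()
{
   const double p[] = {1.0, 2.0, 3.0};
   const double q[] = {1.0, 0.5, 0.25};
   std::printf("%g\n", eval_rational(p, q, 0.5));  // (1 + 2z + 3z^2) / (1 + z/2 + z^2/4)
}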

View File

@@ -12,6 +12,7 @@
#pragma warning(disable:4702) // Unreachable code (release mode only warning)
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/detail/bessel_j1.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/tools/rational.hpp>
@@ -36,12 +37,12 @@
namespace boost { namespace math { namespace detail{
template <typename T, typename Policy>
T bessel_y1(T x, const Policy&);
BOOST_MATH_GPU_ENABLED T bessel_y1(T x, const Policy&);
template <typename T, typename Policy>
T bessel_y1(T x, const Policy&)
BOOST_MATH_GPU_ENABLED T bessel_y1(T x, const Policy&)
{
static const T P1[] = {
BOOST_MATH_STATIC const T P1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.0535726612579544093e+13)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4708611716525426053e+12)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.7595974497819597599e+11)),
@@ -50,7 +51,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2157953222280260820e+05)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.1714424660046133456e+02)),
};
static const T Q1[] = {
BOOST_MATH_STATIC const T Q1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.0737873921079286084e+14)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1272286200406461981e+12)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.7800352738690585613e+10)),
@@ -59,7 +60,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.2079908168393867438e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T P2[] = {
BOOST_MATH_STATIC const T P2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1514276357909013326e+19)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -5.6808094574724204577e+18)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -2.3638408497043134724e+16)),
@@ -70,7 +71,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.9153806858264202986e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2337180442012953128e+03)),
};
static const T Q2[] = {
BOOST_MATH_STATIC const T Q2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.3321844313316185697e+20)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.6968198822857178911e+18)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.0837179548112881950e+16)),
@@ -81,7 +82,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.2855164849321609336e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T PC[] = {
BOOST_MATH_STATIC const T PC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278571e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9422465050776411957e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.6033732483649391093e+06)),
@@ -90,7 +91,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.6116166443246101165e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)),
};
static const T QC[] = {
BOOST_MATH_STATIC const T QC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278568e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9341243899345856590e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.5853394797230870728e+06)),
@@ -99,7 +100,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4550094401904961825e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T PS[] = {
BOOST_MATH_STATIC const T PS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.3220913409857223519e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.5145160675335701966e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6178836581270835179e+04)),
@@ -108,7 +109,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.5265133846636032186e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)),
};
static const T QS[] = {
BOOST_MATH_STATIC const T QS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0871281941028743574e+05)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8194580422439972989e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4194606696037208929e+06)),
@@ -117,7 +118,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.6383677696049909675e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1971413260310170351e+00)),
BOOST_MATH_STATIC const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1971413260310170351e+00)),
x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4296810407941351328e+00)),
x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.620e+02)),
x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8288260310170351490e-03)),

View File

@@ -10,9 +10,11 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/detail/bessel_y0.hpp>
#include <boost/math/special_functions/detail/bessel_y1.hpp>
#include <boost/math/special_functions/detail/bessel_jy_series.hpp>
#include <boost/math/special_functions/sign.hpp>
#include <boost/math/policies/error_handling.hpp>
// Bessel function of the second kind of integer order
@@ -21,14 +23,14 @@
namespace boost { namespace math { namespace detail{
template <typename T, typename Policy>
T bessel_yn(int n, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED T bessel_yn(int n, T x, const Policy& pol)
{
BOOST_MATH_STD_USING
T value, factor, current, prev;
using namespace boost::math::tools;
static const char* function = "boost::math::bessel_yn<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::bessel_yn<%1%>(%1%,%1%)";
if ((x == 0) && (n == 0))
{

View File

@@ -10,28 +10,29 @@
#pragma once
#endif
#include <type_traits>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/special_functions/round.hpp>
namespace boost { namespace math { namespace detail{
template <class T, class Policy>
inline int iconv_imp(T v, Policy const&, std::true_type const&)
BOOST_MATH_GPU_ENABLED inline int iconv_imp(T v, Policy const&, boost::math::true_type const&)
{
return static_cast<int>(v);
}
template <class T, class Policy>
inline int iconv_imp(T v, Policy const& pol, std::false_type const&)
BOOST_MATH_GPU_ENABLED inline int iconv_imp(T v, Policy const& pol, boost::math::false_type const&)
{
BOOST_MATH_STD_USING
return iround(v, pol);
}
template <class T, class Policy>
inline int iconv(T v, Policy const& pol)
BOOST_MATH_GPU_ENABLED inline int iconv(T v, Policy const& pol)
{
typedef typename std::is_convertible<T, int>::type tag_type;
typedef typename boost::math::is_convertible<T, int>::type tag_type;
return iconv_imp(v, pol, tag_type());
}
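The dispatch is worth spelling out: anything convertible to int (all built-in arithmetic types, double included) takes the straight truncating cast, while the iround path exists for multiprecision types with no implicit conversion. A standalone sketch with std:: names (the commit's version adds BOOST_MATH_GPU_ENABLED and the boost::math trait shims):

#include <cmath>
#include <cstdio>
#include <type_traits>

template <class T>
int iconv_sketch_imp(T v, std::true_type)  { return static_cast<int>(v); }              // truncating cast

template <class T>
int iconv_sketch_imp(T v, std::false_type) { return static_cast<int>(std::lround(v)); } // round instead

template <class T>
int iconv_sketch(T v)
{
   return iconv_sketch_imp(v, typename std::is_convertible<T, int>::type());
}

int main() { std::printf("%d %d\n", iconv_sketch(7), iconv_sketch(7.9)); } // both convertible: 7 7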

View File

@@ -10,19 +10,23 @@
#pragma once
#endif
#ifdef _MSC_VER
#pragma warning(push) // Temporary until lexical cast fixed.
#pragma warning(disable: 4127 4701)
#endif
#include <boost/math/tools/convert_from_string.hpp>
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#include <cmath>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/array.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/cxx03_warn.hpp>
#include <array>
#include <type_traits>
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
# ifdef _MSC_VER
# pragma warning(push) // Temporary until lexical cast fixed.
# pragma warning(disable: 4127 4701)
# endif
# include <boost/math/tools/convert_from_string.hpp>
# ifdef _MSC_VER
# pragma warning(pop)
# endif
#endif
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -46,13 +50,21 @@ struct max_factorial;
template <class T, bool = true>
struct unchecked_factorial_data;
#ifdef BOOST_MATH_HAS_NVRTC
// Need fwd decl
template <typename T>
BOOST_MATH_GPU_ENABLED inline T unchecked_factorial(unsigned i);
#endif
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
template <bool b>
struct unchecked_factorial_data<float, b>
{
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
static constexpr std::array<float, 35> factorials = { {
static constexpr boost::math::array<float, 35> factorials = { {
1.0F,
1.0F,
2.0F,
@@ -90,15 +102,15 @@ struct unchecked_factorial_data<float, b>
0.29523279903960414084761860964352e39F,
}};
#else
static const std::array<float, 35> factorials;
static const boost::math::array<float, 35> factorials;
#endif
};
template<bool b>
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
constexpr std::array<float, 35> unchecked_factorial_data<float, b>::factorials;
constexpr boost::math::array<float, 35> unchecked_factorial_data<float, b>::factorials;
#else
const std::array<float, 35> unchecked_factorial_data<float, b>::factorials = {{
const boost::math::array<float, 35> unchecked_factorial_data<float, b>::factorials = {{
1.0F,
1.0F,
2.0F,
@@ -204,7 +216,7 @@ template <bool b>
struct unchecked_factorial_data<double, b>
{
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
static constexpr std::array<double, 171> factorials = { {
static constexpr boost::math::array<double, 171> factorials = { {
1.0,
1.0,
2.0,
@@ -378,15 +390,15 @@ struct unchecked_factorial_data<double, b>
0.7257415615307998967396728211129263114717e307,
}};
#else
static const std::array<double, 171> factorials;
static const boost::math::array<double, 171> factorials;
#endif
};
template <bool b>
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
constexpr std::array<double, 171> unchecked_factorial_data<double, b>::factorials;
constexpr boost::math::array<double, 171> unchecked_factorial_data<double, b>::factorials;
#else
const std::array<double, 171> unchecked_factorial_data<double, b>::factorials = {{
const boost::math::array<double, 171> unchecked_factorial_data<double, b>::factorials = {{
1.0,
1.0,
2.0,
@@ -633,7 +645,7 @@ template <bool b>
struct unchecked_factorial_data<long double, b>
{
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
static constexpr std::array<long double, 171> factorials = { {
static constexpr boost::math::array<long double, 171> factorials = { {
1L,
1L,
2L,
@@ -807,15 +819,15 @@ struct unchecked_factorial_data<long double, b>
0.7257415615307998967396728211129263114717e307L,
}};
#else
static const std::array<long double, 171> factorials;
static const boost::math::array<long double, 171> factorials;
#endif
};
template <bool b>
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
constexpr std::array<long double, 171> unchecked_factorial_data<long double, b>::factorials;
constexpr boost::math::array<long double, 171> unchecked_factorial_data<long double, b>::factorials;
#else
const std::array<long double, 171> unchecked_factorial_data<long double, b>::factorials = {{
const boost::math::array<long double, 171> unchecked_factorial_data<long double, b>::factorials = {{
1L,
1L,
2L,
@@ -1008,7 +1020,7 @@ template <bool b>
struct unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>
{
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
static constexpr std::array<BOOST_MATH_FLOAT128_TYPE, 171> factorials = { {
static constexpr boost::math::array<BOOST_MATH_FLOAT128_TYPE, 171> factorials = { {
1,
1,
2,
@@ -1182,15 +1194,15 @@ struct unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>
0.7257415615307998967396728211129263114717e307Q,
} };
#else
static const std::array<BOOST_MATH_FLOAT128_TYPE, 171> factorials;
static const boost::math::array<BOOST_MATH_FLOAT128_TYPE, 171> factorials;
#endif
};
template <bool b>
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
constexpr std::array<BOOST_MATH_FLOAT128_TYPE, 171> unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>::factorials;
constexpr boost::math::array<BOOST_MATH_FLOAT128_TYPE, 171> unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>::factorials;
#else
const std::array<BOOST_MATH_FLOAT128_TYPE, 171> unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>::factorials = { {
const boost::math::array<BOOST_MATH_FLOAT128_TYPE, 171> unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>::factorials = { {
1,
1,
2,
@@ -1402,7 +1414,7 @@ const typename unchecked_factorial_initializer<T>::init unchecked_factorial_init
template <class T, int N>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, N>&)
inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, N>&)
{
//
// If you're foolish enough to instantiate factorial
@@ -1416,10 +1428,10 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, N
// unsigned int nfac = static_cast<unsigned int>(factorial<double>(n));
// See factorial documentation for more detail.
//
static_assert(!std::is_integral<T>::value && !std::numeric_limits<T>::is_integer, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value && !boost::math::numeric_limits<T>::is_integer, "Type T must not be an integral type");
// We rely on C++11 thread safe initialization here:
static const std::array<T, 101> factorials = {{
static const boost::math::array<T, 101> factorials = {{
T(boost::math::tools::convert_from_string<T>("1")),
T(boost::math::tools::convert_from_string<T>("1")),
T(boost::math::tools::convert_from_string<T>("2")),
@@ -1527,7 +1539,7 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, N
}
template <class T>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 0>&)
inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, 0>&)
{
//
// If you're foolish enough to instantiate factorial
@@ -1541,7 +1553,7 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 0
// unsigned int nfac = static_cast<unsigned int>(factorial<double>(n));
// See factorial documentation for more detail.
//
static_assert(!std::is_integral<T>::value && !std::numeric_limits<T>::is_integer, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value && !boost::math::numeric_limits<T>::is_integer, "Type T must not be an integral type");
static const char* const factorial_strings[] = {
"1",
@@ -1667,13 +1679,13 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 0
#endif // BOOST_MATH_HAS_GPU_SUPPORT
template <class T>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, std::numeric_limits<float>::digits>&)
BOOST_MATH_GPU_ENABLED inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, boost::math::numeric_limits<float>::digits>&)
{
return unchecked_factorial<float>(i);
}
template <class T>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, std::numeric_limits<double>::digits>&)
BOOST_MATH_GPU_ENABLED inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, boost::math::numeric_limits<double>::digits>&)
{
return unchecked_factorial<double>(i);
}
@@ -1682,14 +1694,14 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, s
#if DBL_MANT_DIG != LDBL_MANT_DIG
template <class T>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, LDBL_MANT_DIG>&)
inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, LDBL_MANT_DIG>&)
{
return unchecked_factorial<long double>(i);
}
#endif
#ifdef BOOST_MATH_USE_FLOAT128
template <class T>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 113>&)
inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, 113>&)
{
return unchecked_factorial<BOOST_MATH_FLOAT128_TYPE>(i);
}
@@ -1698,14 +1710,14 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 1
#endif // BOOST_MATH_HAS_GPU_SUPPORT
template <class T>
inline T unchecked_factorial(unsigned i)
BOOST_MATH_GPU_ENABLED inline T unchecked_factorial(unsigned i)
{
typedef typename boost::math::policies::precision<T, boost::math::policies::policy<> >::type tag_type;
return unchecked_factorial_imp<T>(i, tag_type());
}
#ifdef BOOST_MATH_USE_FLOAT128
#define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL : std::numeric_limits<T>::digits == 113 ? max_factorial<BOOST_MATH_FLOAT128_TYPE>::value
#define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL : boost::math::numeric_limits<T>::digits == 113 ? max_factorial<BOOST_MATH_FLOAT128_TYPE>::value
#else
#define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL
#endif
@@ -1714,10 +1726,10 @@ template <class T>
struct max_factorial
{
static constexpr unsigned value =
std::numeric_limits<T>::digits == std::numeric_limits<float>::digits ? max_factorial<float>::value
: std::numeric_limits<T>::digits == std::numeric_limits<double>::digits ? max_factorial<double>::value
boost::math::numeric_limits<T>::digits == boost::math::numeric_limits<float>::digits ? max_factorial<float>::value
: boost::math::numeric_limits<T>::digits == boost::math::numeric_limits<double>::digits ? max_factorial<double>::value
#ifndef BOOST_MATH_GPU_ENABLED
: std::numeric_limits<T>::digits == std::numeric_limits<long double>::digits ? max_factorial<long double>::value
: boost::math::numeric_limits<T>::digits == boost::math::numeric_limits<long double>::digits ? max_factorial<long double>::value
BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL
#endif
: 100;

View File

@@ -15,9 +15,6 @@
#ifndef BOOST_MATH_HAS_NVRTC
#include <cmath>
#include <cstdint>
#include <limits>
#include <boost/math/tools/series.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/big_constant.hpp>
@@ -25,6 +22,9 @@
#include <boost/math/tools/rational.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -49,10 +49,10 @@ namespace detail
{
typedef T result_type;
expm1_series(T x)
BOOST_MATH_GPU_ENABLED expm1_series(T x)
: k(0), m_x(x), m_term(1) {}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
++k;
m_term *= m_x;
@@ -60,7 +60,7 @@ namespace detail
return m_term;
}
int count()const
BOOST_MATH_GPU_ENABLED int count()const
{
return k;
}
@@ -78,26 +78,28 @@ struct expm1_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init(tag());
}
template <int N>
static void do_init(const std::integral_constant<int, N>&){}
static void do_init(const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, N>&){}
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
{
expm1(T(0.5));
}
static void do_init(const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
{
expm1(T(0.5));
}
void force_instantiate()const{}
BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_STATIC const init initializer;
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
@@ -110,7 +112,7 @@ const typename expm1_initializer<T, Policy, tag>::init expm1_initializer<T, Poli
// This version uses a Taylor series expansion for 0.5 > |x| > epsilon.
//
template <class T, class Policy>
T expm1_imp(T x, const std::integral_constant<int, 0>&, const Policy& pol)
T expm1_imp(T x, const boost::math::integral_constant<int, 0>&, const Policy& pol)
{
BOOST_MATH_STD_USING
@@ -132,7 +134,7 @@ T expm1_imp(T x, const std::integral_constant<int, 0>&, const Policy& pol)
if(a < tools::epsilon<T>())
return x;
detail::expm1_series<T> s(x);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
T result = tools::sum_series(s, policies::get_epsilon<T, Policy>(), max_iter);
@@ -141,7 +143,7 @@ T expm1_imp(T x, const std::integral_constant<int, 0>&, const Policy& pol)
}
template <class T, class P>
T expm1_imp(T x, const std::integral_constant<int, 53>&, const P& pol)
BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant<int, 53>&, const P& pol)
{
BOOST_MATH_STD_USING
@@ -159,16 +161,16 @@ T expm1_imp(T x, const std::integral_constant<int, 53>&, const P& pol)
if(a < tools::epsilon<T>())
return x;
static const float Y = 0.10281276702880859e1f;
static const T n[] = { static_cast<T>(-0.28127670288085937e-1), static_cast<T>(0.51278186299064534e0), static_cast<T>(-0.6310029069350198e-1), static_cast<T>(0.11638457975729296e-1), static_cast<T>(-0.52143390687521003e-3), static_cast<T>(0.21491399776965688e-4) };
static const T d[] = { 1, static_cast<T>(-0.45442309511354755e0), static_cast<T>(0.90850389570911714e-1), static_cast<T>(-0.10088963629815502e-1), static_cast<T>(0.63003407478692265e-3), static_cast<T>(-0.17976570003654402e-4) };
BOOST_MATH_STATIC const float Y = 0.10281276702880859e1f;
BOOST_MATH_STATIC const T n[] = { static_cast<T>(-0.28127670288085937e-1), static_cast<T>(0.51278186299064534e0), static_cast<T>(-0.6310029069350198e-1), static_cast<T>(0.11638457975729296e-1), static_cast<T>(-0.52143390687521003e-3), static_cast<T>(0.21491399776965688e-4) };
BOOST_MATH_STATIC const T d[] = { 1, static_cast<T>(-0.45442309511354755e0), static_cast<T>(0.90850389570911714e-1), static_cast<T>(-0.10088963629815502e-1), static_cast<T>(0.63003407478692265e-3), static_cast<T>(-0.17976570003654402e-4) };
T result = x * Y + x * tools::evaluate_polynomial(n, x) / tools::evaluate_polynomial(d, x);
return result;
}
template <class T, class P>
T expm1_imp(T x, const std::integral_constant<int, 64>&, const P& pol)
BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant<int, 64>&, const P& pol)
{
BOOST_MATH_STD_USING
@@ -186,8 +188,8 @@ T expm1_imp(T x, const std::integral_constant<int, 64>&, const P& pol)
if(a < tools::epsilon<T>())
return x;
static const float Y = 0.10281276702880859375e1f;
static const T n[] = {
BOOST_MATH_STATIC const float Y = 0.10281276702880859375e1f;
BOOST_MATH_STATIC const T n[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, -0.281276702880859375e-1),
BOOST_MATH_BIG_CONSTANT(T, 64, 0.512980290285154286358e0),
BOOST_MATH_BIG_CONSTANT(T, 64, -0.667758794592881019644e-1),
@@ -196,7 +198,7 @@ T expm1_imp(T x, const std::integral_constant<int, 64>&, const P& pol)
BOOST_MATH_BIG_CONSTANT(T, 64, 0.447441185192951335042e-4),
BOOST_MATH_BIG_CONSTANT(T, 64, -0.714539134024984593011e-6)
};
static const T d[] = {
BOOST_MATH_STATIC const T d[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 64, -0.461477618025562520389e0),
BOOST_MATH_BIG_CONSTANT(T, 64, 0.961237488025708540713e-1),
@@ -211,7 +213,7 @@ T expm1_imp(T x, const std::integral_constant<int, 64>&, const P& pol)
}
template <class T, class P>
T expm1_imp(T x, const std::integral_constant<int, 113>&, const P& pol)
BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant<int, 113>&, const P& pol)
{
BOOST_MATH_STD_USING
@@ -263,7 +265,7 @@ T expm1_imp(T x, const std::integral_constant<int, 113>&, const P& pol)
} // namespace detail
template <class T, class Policy>
inline typename tools::promote_args<T>::type expm1(T x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type expm1(T x, const Policy& /* pol */)
{
typedef typename tools::promote_args<T>::type result_type;
typedef typename policies::evaluation<result_type, Policy>::type value_type;
@@ -275,7 +277,7 @@ inline typename tools::promote_args<T>::type expm1(T x, const Policy& /* pol */)
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
typedef std::integral_constant<int,
typedef boost::math::integral_constant<int,
precision_type::value <= 0 ? 0 :
precision_type::value <= 53 ? 53 :
precision_type::value <= 64 ? 64 :
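The series functor above is what tools::sum_series drives: each call returns the next Taylor term x^k/k!, and summation stops once terms drop below epsilon relative to the running sum. A standalone sketch of that loop (plain double, fixed tolerance, no Policy-supplied iteration cap):

#include <cmath>
#include <cstdint>
#include <cstdio>

struct expm1_series_sketch
{
   explicit expm1_series_sketch(double x) : k(0), m_x(x), m_term(1) {}
   double operator()() { ++k; m_term *= m_x; m_term /= k; return m_term; }  // next x^k / k!
   int k;
   double m_x, m_term;
};

double expm1_by_series(double x)
{
   expm1_series_sketch s(x);
   double sum = 0, next;
   std::uintmax_t max_iter = 1000;  // stands in for policies::get_max_series_iterations
   do { next = s(); sum += next; } while (std::fabs(next) > 1e-17 * std::fabs(sum) && --max_iter);
   return sum;
}

int main() { std::printf("%.17g\n%.17g\n", expm1_by_series(0.25), std::expm1(0.25)); }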

View File

@@ -10,10 +10,14 @@
#pragma once
#endif
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/gamma.hpp>
#include <boost/math/special_functions/detail/unchecked_factorial.hpp>
#include <array>
#include <boost/math/special_functions/math_fwd.hpp>
#ifdef _MSC_VER
#pragma warning(push) // Temporary until lexical cast fixed.
#pragma warning(disable: 4127 4701)
@@ -21,16 +25,14 @@
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#include <type_traits>
#include <cmath>
namespace boost { namespace math
{
template <class T, class Policy>
inline T factorial(unsigned i, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T factorial(unsigned i, const Policy& pol)
{
static_assert(!std::is_integral<T>::value, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value, "Type T must not be an integral type");
// factorial<unsigned int>(n) is not implemented
// because it would overflow integral type T for too small n
// to be useful. Use instead a floating-point type,
@@ -49,7 +51,7 @@ inline T factorial(unsigned i, const Policy& pol)
}
template <class T>
inline T factorial(unsigned i)
BOOST_MATH_GPU_ENABLED inline T factorial(unsigned i)
{
return factorial<T>(i, policies::policy<>());
}
@@ -72,9 +74,9 @@ inline double factorial<double>(unsigned i)
}
*/
template <class T, class Policy>
T double_factorial(unsigned i, const Policy& pol)
BOOST_MATH_GPU_ENABLED T double_factorial(unsigned i, const Policy& pol)
{
static_assert(!std::is_integral<T>::value, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value, "Type T must not be an integral type");
BOOST_MATH_STD_USING // ADL lookup of std names
if(i & 1)
{
@@ -107,17 +109,20 @@ T double_factorial(unsigned i, const Policy& pol)
}
template <class T>
inline T double_factorial(unsigned i)
BOOST_MATH_GPU_ENABLED inline T double_factorial(unsigned i)
{
return double_factorial<T>(i, policies::policy<>());
}
// TODO(mborland): We do not currently have support for tgamma_delta_ratio
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
namespace detail{
template <class T, class Policy>
T rising_factorial_imp(T x, int n, const Policy& pol)
{
static_assert(!std::is_integral<T>::value, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value, "Type T must not be an integral type");
if(x < 0)
{
//
@@ -165,7 +170,7 @@ T rising_factorial_imp(T x, int n, const Policy& pol)
template <class T, class Policy>
inline T falling_factorial_imp(T x, unsigned n, const Policy& pol)
{
static_assert(!std::is_integral<T>::value, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value, "Type T must not be an integral type");
BOOST_MATH_STD_USING // ADL of std names
if(x == 0)
return 0;
@@ -262,6 +267,8 @@ inline typename tools::promote_args<RT>::type
static_cast<result_type>(x), n, pol);
}
#endif // BOOST_MATH_HAS_GPU_SUPPORT
} // namespace math
} // namespace boost
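For the double_factorial branches, the identities being exploited are (2k)!! = 2^k * k! and (2k+1)!! = (2k+1)! / (2^k * k!). A standalone sketch of those identities (plain double, no overflow policy, tgamma standing in for the factorial tables):

#include <cmath>
#include <cstdio>

double factorial_d(unsigned n) { return std::tgamma(n + 1.0); }

double double_factorial_sketch(unsigned i)
{
   unsigned k = i / 2;
   if (i & 1)   // odd: (2k+1)!! = (2k+1)! / (2^k * k!)
   {
      return factorial_d(i) / (std::ldexp(1.0, k) * factorial_d(k));
   }
   return std::ldexp(1.0, k) * factorial_d(k);   // even: (2k)!! = 2^k * k!
}

int main() { std::printf("%g %g\n", double_factorial_sketch(5), double_factorial_sketch(6)); } // 15 48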

View File

@@ -2287,6 +2287,7 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
#else
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/expm1.hpp>
namespace boost {
namespace math {
@@ -2295,7 +2296,7 @@ inline BOOST_MATH_GPU_ENABLED float tgamma(float x) { return ::tgammaf(x); }
inline BOOST_MATH_GPU_ENABLED double tgamma(double x) { return ::tgamma(x); }
template <typename T, typename Policy>
inline BOOST_MATH_GPU_ENABLED T tgamma(T x, const Policy&)
BOOST_MATH_GPU_ENABLED T tgamma(T x, const Policy&)
{
return boost::math::tgamma(x);
}
@@ -2304,11 +2305,49 @@ inline BOOST_MATH_GPU_ENABLED float lgamma(float x) { return ::lgammaf(x); }
inline BOOST_MATH_GPU_ENABLED double lgamma(double x) { return ::lgamma(x); }
template <typename T, typename Policy>
inline BOOST_MATH_GPU_ENABLED T lgamma(T x, const Policy&)
BOOST_MATH_GPU_ENABLED T lgamma(T x, const Policy&)
{
return boost::math::lgamma(x);
}
template <typename T, typename Policy>
BOOST_MATH_GPU_ENABLED T lgamma(T x, int* sign, const Policy&)
{
auto res = boost::math::lgamma(x);
if (sign != nullptr)
{
if (res < 0)
{
*sign = -1;
}
else
{
*sign = 1;
}
}
return res;
}
template <typename T>
BOOST_MATH_GPU_ENABLED T tgamma1pm1(T z)
{
using namespace boost::math;
if (fabs(z) < T(0.55))
{
return expm1(lgamma(z));
}
return expm1(lgamma(1 + z));
}
template <typename T, typename Policy>
BOOST_MATH_GPU_ENABLED T tgamma1pm1(T x, const Policy&)
{
return tgamma1pm1(x);
}
} // namespace math
} // namespace boost
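A device-side usage sketch for the new sign-reporting lgamma overload. This assumes a CUDA translation unit with the relevant headers available to the NVRTC program; the kernel and buffer names are illustrative, not part of the commit:

__global__ void lgamma_kernel(const double* in, double* out, int* signs, int n)
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < n)
   {
      // Forwards to ::lgamma and reports the sign of the returned log-gamma value.
      out[i] = boost::math::lgamma(in[i], &signs[i], boost::math::policies::policy<>());
   }
}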

View File

@@ -12,20 +12,20 @@
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <algorithm> // for swap
#include <cmath>
namespace boost{ namespace math{ namespace detail{
template <class T, class Policy>
T hypot_imp(T x, T y, const Policy& pol)
BOOST_MATH_GPU_ENABLED T hypot_imp(T x, T y, const Policy& pol)
{
//
// Normalize x and y, so that both are positive and x >= y:
//
using std::fabs; using std::sqrt; // ADL of std names
BOOST_MATH_STD_USING
x = fabs(x);
y = fabs(y);
@@ -35,16 +35,16 @@ T hypot_imp(T x, T y, const Policy& pol)
#pragma warning(disable: 4127)
#endif
// special case, see C99 Annex F:
if(std::numeric_limits<T>::has_infinity
&& ((x == std::numeric_limits<T>::infinity())
|| (y == std::numeric_limits<T>::infinity())))
if(boost::math::numeric_limits<T>::has_infinity
&& ((x == boost::math::numeric_limits<T>::infinity())
|| (y == boost::math::numeric_limits<T>::infinity())))
return policies::raise_overflow_error<T>("boost::math::hypot<%1%>(%1%,%1%)", nullptr, pol);
#ifdef _MSC_VER
#pragma warning(pop)
#endif
if(y > x)
(std::swap)(x, y);
BOOST_MATH_GPU_SAFE_SWAP(x, y);
if(x * tools::epsilon<T>() >= y)
return x;
@@ -56,7 +56,7 @@ T hypot_imp(T x, T y, const Policy& pol)
}
template <class T1, class T2>
inline typename tools::promote_args<T1, T2>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2>::type
hypot(T1 x, T2 y)
{
typedef typename tools::promote_args<T1, T2>::type result_type;
@@ -65,7 +65,7 @@ inline typename tools::promote_args<T1, T2>::type
}
template <class T1, class T2, class Policy>
inline typename tools::promote_args<T1, T2>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2>::type
hypot(T1 x, T2 y, const Policy& pol)
{
typedef typename tools::promote_args<T1, T2>::type result_type;
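The guarded steps in hypot_imp (normalize, C99 Annex F infinity check, epsilon early-out, factor out the larger operand) read more clearly in a standalone sketch; this is the same logic in plain C++, without the policy plumbing or the GPU-safe swap shim:

#include <cmath>
#include <cstdio>
#include <limits>
#include <utility>

double hypot_sketch(double x, double y)
{
   x = std::fabs(x); y = std::fabs(y);
   if (std::isinf(x) || std::isinf(y))              // C99 Annex F special case
      return std::numeric_limits<double>::infinity();
   if (y > x) std::swap(x, y);                      // now x >= y >= 0
   if (x * std::numeric_limits<double>::epsilon() >= y)
      return x;                                     // y contributes nothing
   double r = y / x;
   return x * std::sqrt(1 + r * r);                 // no overflow from squaring x
}

int main() { std::printf("%g\n", hypot_sketch(3e200, 4e200)); }  // 5e+200, where naive x*x+y*y overflows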

View File

@@ -24,12 +24,16 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
#include <vector>
#include <complex>
#include <type_traits>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/detail/round_fwd.hpp>
#include <boost/math/tools/promotion.hpp> // for argument promotion.
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/policies/policy.hpp>
#define BOOST_NO_MACRO_EXPAND /**/
@@ -420,15 +424,15 @@ namespace boost
template <class RT>
struct max_factorial;
template <class RT>
RT factorial(unsigned int);
BOOST_MATH_GPU_ENABLED RT factorial(unsigned int);
template <class RT, class Policy>
RT factorial(unsigned int, const Policy& pol);
BOOST_MATH_GPU_ENABLED RT factorial(unsigned int, const Policy& pol);
template <class RT>
BOOST_MATH_GPU_ENABLED RT unchecked_factorial(unsigned int BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(RT));
template <class RT>
RT double_factorial(unsigned i);
BOOST_MATH_GPU_ENABLED RT double_factorial(unsigned i);
template <class RT, class Policy>
RT double_factorial(unsigned i, const Policy& pol);
BOOST_MATH_GPU_ENABLED RT double_factorial(unsigned i, const Policy& pol);
template <class RT>
tools::promote_args_t<RT> falling_factorial(RT x, unsigned n);
@@ -554,11 +558,11 @@ namespace boost
// Hypotenuse function sqrt(x ^ 2 + y ^ 2).
template <class T1, class T2>
tools::promote_args_t<T1, T2>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
hypot(T1 x, T2 y);
template <class T1, class T2, class Policy>
tools::promote_args_t<T1, T2>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
hypot(T1 x, T2 y, const Policy&);
// cbrt - cube root.
@@ -607,10 +611,10 @@ namespace boost
// sinus cardinals:
template <class T>
tools::promote_args_t<T> sinc_pi(T x);
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> sinc_pi(T x);
template <class T, class Policy>
tools::promote_args_t<T> sinc_pi(T x, const Policy&);
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> sinc_pi(T x, const Policy&);
template <class T>
tools::promote_args_t<T> sinhc_pi(T x);
@@ -639,36 +643,36 @@ namespace boost
namespace detail{
typedef std::integral_constant<int, 0> bessel_no_int_tag; // No integer optimisation possible.
typedef std::integral_constant<int, 1> bessel_maybe_int_tag; // Maybe integer optimisation.
typedef std::integral_constant<int, 2> bessel_int_tag; // Definite integer optimisation.
typedef boost::math::integral_constant<int, 0> bessel_no_int_tag; // No integer optimisation possible.
typedef boost::math::integral_constant<int, 1> bessel_maybe_int_tag; // Maybe integer optimisation.
typedef boost::math::integral_constant<int, 2> bessel_int_tag; // Definite integer optimisation.
template <class T1, class T2, class Policy>
struct bessel_traits
{
using result_type = typename std::conditional<
std::is_integral<T1>::value,
using result_type = typename boost::math::conditional<
boost::math::is_integral<T1>::value,
typename tools::promote_args<T2>::type,
tools::promote_args_t<T1, T2>
>::type;
typedef typename policies::precision<result_type, Policy>::type precision_type;
using optimisation_tag = typename std::conditional<
using optimisation_tag = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 64),
bessel_no_int_tag,
typename std::conditional<
std::is_integral<T1>::value,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
using optimisation_tag128 = typename std::conditional<
using optimisation_tag128 = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 113),
bessel_no_int_tag,
typename std::conditional<
std::is_integral<T1>::value,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
@@ -678,98 +682,98 @@ namespace boost
// Bessel functions:
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j(T1 v, T2 x, const Policy& pol);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j_prime(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j_prime(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j(T1 v, T2 x);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j_prime(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j_prime(T1 v, T2 x);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel(unsigned v, T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel(unsigned v, T x, const Policy& pol);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel_prime(unsigned v, T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel_prime(unsigned v, T x, const Policy& pol);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel(unsigned v, T x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel(unsigned v, T x);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel_prime(unsigned v, T x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel_prime(unsigned v, T x);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i(T1 v, T2 x, const Policy& pol);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i_prime(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i_prime(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i(T1 v, T2 x);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i_prime(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i_prime(T1 v, T2 x);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k(T1 v, T2 x, const Policy& pol);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k_prime(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k_prime(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k(T1 v, T2 x);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k_prime(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k_prime(T1 v, T2 x);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann(T1 v, T2 x, const Policy& pol);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann_prime(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann_prime(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann(T1 v, T2 x);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann_prime(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann_prime(T1 v, T2 x);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann(unsigned v, T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann(unsigned v, T x, const Policy& pol);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann_prime(unsigned v, T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann_prime(unsigned v, T x, const Policy& pol);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann(unsigned v, T x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann(unsigned v, T x);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann_prime(unsigned v, T x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann_prime(unsigned v, T x);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type cyl_bessel_j_zero(T v, int m, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type cyl_bessel_j_zero(T v, int m, const Policy& pol);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_bessel_j_zero(T v, int m);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_bessel_j_zero(T v, int m);
template <class T, class OutputIterator>
OutputIterator cyl_bessel_j_zero(T v,
BOOST_MATH_GPU_ENABLED OutputIterator cyl_bessel_j_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it);
template <class T, class OutputIterator, class Policy>
OutputIterator cyl_bessel_j_zero(T v,
BOOST_MATH_GPU_ENABLED OutputIterator cyl_bessel_j_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it,
const Policy&);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type cyl_neumann_zero(T v, int m, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type cyl_neumann_zero(T v, int m, const Policy& pol);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_neumann_zero(T v, int m);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_neumann_zero(T v, int m);
template <class T, class OutputIterator>
OutputIterator cyl_neumann_zero(T v,
BOOST_MATH_GPU_ENABLED OutputIterator cyl_neumann_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it);
template <class T, class OutputIterator, class Policy>
OutputIterator cyl_neumann_zero(T v,
BOOST_MATH_GPU_ENABLED OutputIterator cyl_neumann_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it,
@@ -1400,10 +1404,10 @@ namespace boost
\
using boost::math::max_factorial;\
template <class RT>\
inline RT factorial(unsigned int i) { return boost::math::factorial<RT>(i, Policy()); }\
BOOST_MATH_GPU_ENABLED inline RT factorial(unsigned int i) { return boost::math::factorial<RT>(i, Policy()); }\
using boost::math::unchecked_factorial;\
template <class RT>\
inline RT double_factorial(unsigned i){ return boost::math::double_factorial<RT>(i, Policy()); }\
BOOST_MATH_GPU_ENABLED inline RT double_factorial(unsigned i){ return boost::math::double_factorial<RT>(i, Policy()); }\
template <class RT>\
inline boost::math::tools::promote_args_t<RT> falling_factorial(RT x, unsigned n){ return boost::math::falling_factorial(x, n, Policy()); }\
template <class RT>\
@@ -1465,7 +1469,7 @@ namespace boost
\
template <class T1, class T2>\
inline boost::math::tools::promote_args_t<T1, T2> \
hypot(T1 x, T2 y){ return boost::math::hypot(x, y, Policy()); }\
BOOST_MATH_GPU_ENABLED hypot(T1 x, T2 y){ return boost::math::hypot(x, y, Policy()); }\
\
template <class RT>\
inline boost::math::tools::promote_args_t<RT> cbrt(RT z){ return boost::math::cbrt(z, Policy()); }\
@@ -1487,7 +1491,7 @@ namespace boost
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> sqrt1pm1(const T& val){ return boost::math::sqrt1pm1(val, Policy()); }\
\
template <class T>\
inline boost::math::tools::promote_args_t<T> sinc_pi(T x){ return boost::math::sinc_pi(x, Policy()); }\
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> sinc_pi(T x){ return boost::math::sinc_pi(x, Policy()); }\
\
template <class T>\
inline boost::math::tools::promote_args_t<T> sinhc_pi(T x){ return boost::math::sinhc_pi(x, Policy()); }\
@@ -1817,6 +1821,6 @@ template <class OutputIterator, class T>\
#endif // BOOST_MATH_HAS_NVRTC
#endif // BOOST_MATH_SPECIAL_MATH_FWD_HPP
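The bessel_traits tags deserve one concrete reading: integral orders get the definite-integer fast path, floating orders the maybe-integer one, and anything wider than 64 mantissa bits opts out entirely. A simplified standalone sketch, with numeric_limits standing in for the policy-derived precision_type:

#include <type_traits>
#include <limits>

using no_int_tag    = std::integral_constant<int, 0>;
using maybe_int_tag = std::integral_constant<int, 1>;
using int_tag       = std::integral_constant<int, 2>;

template <class T1, class T2>
struct traits_sketch
{
   static constexpr int digits = std::numeric_limits<T2>::digits;
   using optimisation_tag = typename std::conditional<
      (digits <= 0 || digits > 64),
      no_int_tag,
      typename std::conditional<std::is_integral<T1>::value, int_tag, maybe_int_tag>::type
   >::type;
};

static_assert(traits_sketch<int, double>::optimisation_tag::value == 2, "integer order");
static_assert(traits_sketch<double, double>::optimisation_tag::value == 1, "maybe integer");

int main() {}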

View File

@@ -10,6 +10,11 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
// TODO(mborland): Need to remove recursion from these algos
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
@@ -920,4 +925,6 @@ inline typename tools::promote_args<T>::type float_advance(const T& val, int dis
}} // boost math namespaces
#endif
#endif // BOOST_MATH_SPECIAL_NEXT_HPP

View File

@@ -273,6 +273,30 @@ BOOST_MATH_GPU_ENABLED float round(float x, const Policy&)
return ::roundf(x);
}
template <typename T>
BOOST_MATH_GPU_ENABLED int iround(T x)
{
return static_cast<int>(::lround(x));
}
template <>
BOOST_MATH_GPU_ENABLED int iround(float x)
{
return static_cast<int>(::lroundf(x));
}
template <typename T, typename Policy>
BOOST_MATH_GPU_ENABLED int iround(T x, const Policy&)
{
return static_cast<int>(::lround(x));
}
template <typename Policy>
BOOST_MATH_GPU_ENABLED int iround(float x, const Policy&)
{
return static_cast<int>(::lroundf(x));
}
template <typename T>
BOOST_MATH_GPU_ENABLED long lround(T x)
{
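Usage of the new iround overloads from device code, for orientation (illustrative kernel; under NVRTC the float overload lowers to ::lroundf and everything else to ::lround):

extern "C" __global__ void iround_kernel(const float* in, int* out, int n)
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < n)
   {
      out[i] = boost::math::iround(in[i]);   // ::lroundf path
   }
}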

View File

@@ -17,13 +17,13 @@
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/promotion.hpp>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <limits>
#include <string>
#include <stdexcept>
#include <cmath>
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/special_functions/math_fwd.hpp>
#endif
// These are the the "Sinus Cardinal" functions.
@@ -36,7 +36,7 @@ namespace boost
// This is the "Sinus Cardinal" of index Pi.
template<typename T>
inline T sinc_pi_imp(const T x)
BOOST_MATH_GPU_ENABLED inline T sinc_pi_imp(const T x)
{
BOOST_MATH_STD_USING
@@ -44,7 +44,7 @@ namespace boost
{
return 0;
}
else if (abs(x) >= 3.3 * tools::forth_root_epsilon<T>())
else if (abs(x) >= T(3.3) * tools::forth_root_epsilon<T>())
{
return(sin(x)/x);
}
@@ -58,24 +58,23 @@ namespace boost
} // namespace detail
template <class T>
inline typename tools::promote_args<T>::type sinc_pi(T x)
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type sinc_pi(T x)
{
typedef typename tools::promote_args<T>::type result_type;
return detail::sinc_pi_imp(static_cast<result_type>(x));
}
template <class T, class Policy>
inline typename tools::promote_args<T>::type sinc_pi(T x, const Policy&)
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type sinc_pi(T x, const Policy&)
{
typedef typename tools::promote_args<T>::type result_type;
return detail::sinc_pi_imp(static_cast<result_type>(x));
}
template<typename T, template<typename> class U>
inline U<T> sinc_pi(const U<T> x)
BOOST_MATH_GPU_ENABLED inline U<T> sinc_pi(const U<T> x)
{
BOOST_MATH_STD_USING
using ::std::numeric_limits;
T const taylor_0_bound = tools::epsilon<T>();
T const taylor_2_bound = tools::root_epsilon<T>();
@@ -88,11 +87,11 @@ namespace boost
else
{
// approximation by taylor series in x at 0 up to order 0
#ifdef __MWERKS__
#ifdef __MWERKS__
U<T> result = static_cast<U<T> >(1);
#else
#else
U<T> result = U<T>(1);
#endif
#endif
if (abs(x) >= taylor_0_bound)
{
@@ -113,7 +112,7 @@ namespace boost
}
template<typename T, template<typename> class U, class Policy>
inline U<T> sinc_pi(const U<T> x, const Policy&)
BOOST_MATH_GPU_ENABLED inline U<T> sinc_pi(const U<T> x, const Policy&)
{
return sinc_pi(x);
}
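A standalone numerical sketch of the cutoff logic above: below roughly 3.3 * epsilon^(1/4) the direct sin(x)/x evaluation is replaced by the leading Taylor terms, which avoids the 0/0 at the origin. `sinc_pi_sketch` is an illustrative name, not the library routine:
#include <cmath>
#include <limits>
#include <cstdio>
double sinc_pi_sketch(double x)
{
    const double cutoff = 3.3 * std::pow(std::numeric_limits<double>::epsilon(), 0.25);
    if (std::abs(x) >= cutoff)
    {
        return std::sin(x) / x;   // safe: x is well away from zero
    }
    return 1.0 - x * x / 6.0;     // leading Taylor terms of sin(x)/x at 0
}
int main()
{
    std::printf("%.17g %.17g\n", sinc_pi_sketch(0.0), sinc_pi_sketch(0.5));
    return 0;
}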

View File

@@ -14,6 +14,7 @@
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/next.hpp>
#include <boost/math/tools/precision.hpp>
namespace boost{ namespace math{ namespace detail{

View File

@@ -0,0 +1,41 @@
// Copyright (c) 2024 Matt Borland
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// std::array member functions cannot be used directly on GPU platforms
// like CUDA since they are missing the __device__ marker.
// Alias as needed to get the correct support.
#ifndef BOOST_MATH_TOOLS_ARRAY_HPP
#define BOOST_MATH_TOOLS_ARRAY_HPP
#include <boost/math/tools/config.hpp>
#ifdef BOOST_MATH_ENABLE_CUDA
#include <cuda/std/array>
namespace boost {
namespace math {
using cuda::std::array;
} // namespace math
} // namespace boost
#else
#include <array>
namespace boost {
namespace math {
using std::array;
} // namespace math
} // namespace boost
#endif // BOOST_MATH_ENABLE_CUDA
#endif // BOOST_MATH_TOOLS_ARRAY_HPP
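A usage sketch for the alias above, assuming BOOST_MATH_GPU_ENABLED expands to nothing on host-only builds: the same coefficient table then compiles against std::array on the host and cuda::std::array in device code. `eval_poly3` is an illustrative name:
#include <boost/math/tools/array.hpp>
BOOST_MATH_GPU_ENABLED double eval_poly3(double x)
{
    // Coefficients of 1 + 2x + 3x^2 (illustrative values)
    constexpr boost::math::array<double, 3> c {{ 1.0, 2.0, 3.0 }};
    return c[0] + x * (c[1] + x * c[2]);
}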

View File

@@ -676,6 +676,7 @@ namespace boost{ namespace math{
#include <cuda/std/type_traits>
#include <cuda/std/utility>
#include <cuda/std/cstdint>
#include <cuda/std/array>
# define BOOST_MATH_CUDA_ENABLED __host__ __device__
# define BOOST_MATH_HAS_GPU_SUPPORT
@@ -733,7 +734,7 @@ BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b;
template <class T>
BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_min(const T& a, const T& b) { return a < b ? a : b; }
template <class T>
BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { return a > b ? a : b; }
BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return a > b ? a : b; }
#define BOOST_MATH_GPU_SAFE_SWAP(a, b) gpu_safe_swap(a, b)
#define BOOST_MATH_GPU_SAFE_MIN(a, b) gpu_safe_min(a, b)
@@ -794,10 +795,13 @@ BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { retur
#define BOOST_MATH_NOEXCEPT(T) noexcept(boost::math::is_floating_point_v<T>)
#define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)
#define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)
#define BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(T)
#define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast<T>(V)
#define BOOST_MATH_FORCEINLINE __forceinline__
#define BOOST_MATH_STD_USING
#define BOOST_MATH_IF_CONSTEXPR if constexpr
#define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point<T>::value)
#define BOOST_MATH_CONSTEXPR_TABLE_FUNCTION constexpr
// This should be defined to nothing but since it is not specifically a math macro
// we need to undef before proceeding
@@ -829,6 +833,9 @@ BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b;
# define BOOST_MATH_INLINE_CONSTEXPR constexpr
#endif
#define BOOST_MATH_INSTRUMENT_VARIABLE(x)
#define BOOST_MATH_INSTRUMENT_CODE(x)
#endif // NVRTC
#endif // BOOST_MATH_TOOLS_CONFIG_HPP
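A standalone sketch of the gpu-safe helpers this hunk touches (which also fixes the cuda_safe_max/gpu_safe_max naming mismatch): constexpr replacements for std::min and std::max that can carry the __host__ __device__ markers. The *_sketch names are illustrative:
template <class T>
constexpr T gpu_safe_min_sketch(const T& a, const T& b) { return a < b ? a : b; }
template <class T>
constexpr T gpu_safe_max_sketch(const T& a, const T& b) { return a > b ? a : b; }
static_assert(gpu_safe_min_sketch(2, 3) == 2, "min picks the smaller value");
static_assert(gpu_safe_max_sketch(2, 3) == 3, "max picks the larger value");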

View File

@@ -1,4 +1,5 @@
// (C) Copyright John Maddock 2006.
// (C) Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -11,22 +12,19 @@
#endif
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_HAS_NVRTC // Disabled for now
#include <boost/math/tools/complex.hpp> // test for multiprecision types in complex Newton
#include <utility>
#include <cmath>
#include <tuple>
#include <cstdint>
#include <boost/math/tools/cxx03_warn.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/special_functions/sign.hpp>
#include <boost/math/policies/policy.hpp>
#include <boost/math/policies/error_handling.hpp>
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
#include <boost/math/special_functions/next.hpp>
#include <boost/math/tools/toms748_solve.hpp>
#include <boost/math/policies/error_handling.hpp>
#endif
namespace boost {
namespace math {
@@ -37,11 +35,11 @@ namespace detail {
namespace dummy {
template<int n, class T>
typename T::value_type get(const T&) BOOST_MATH_NOEXCEPT(T);
BOOST_MATH_GPU_ENABLED typename T::value_type get(const T&) BOOST_MATH_NOEXCEPT(T);
}
template <class Tuple, class T>
void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T)
BOOST_MATH_GPU_ENABLED void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T)
{
using dummy::get;
// Use ADL to find the right overload for get:
@@ -49,7 +47,7 @@ void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T)
b = get<1>(t);
}
template <class Tuple, class T>
void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T)
BOOST_MATH_GPU_ENABLED void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T)
{
using dummy::get;
// Use ADL to find the right overload for get:
@@ -59,7 +57,7 @@ void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T)
}
template <class Tuple, class T>
inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T)
BOOST_MATH_GPU_ENABLED inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T)
{
using dummy::get;
// Rely on ADL to find the correct overload of get:
@@ -67,26 +65,30 @@ inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T)
}
template <class T, class U, class V>
inline void unpack_tuple(const std::pair<T, U>& p, V& a, V& b) BOOST_MATH_NOEXCEPT(T)
BOOST_MATH_GPU_ENABLED inline void unpack_tuple(const boost::math::pair<T, U>& p, V& a, V& b) BOOST_MATH_NOEXCEPT(T)
{
a = p.first;
b = p.second;
}
template <class T, class U, class V>
inline void unpack_0(const std::pair<T, U>& p, V& a) BOOST_MATH_NOEXCEPT(T)
BOOST_MATH_GPU_ENABLED inline void unpack_0(const boost::math::pair<T, U>& p, V& a) BOOST_MATH_NOEXCEPT(T)
{
a = p.first;
}
template <class F, class T>
void handle_zero_derivative(F f,
BOOST_MATH_GPU_ENABLED void handle_zero_derivative(F f,
T& last_f0,
const T& f0,
T& delta,
T& result,
T& guess,
const T& min,
const T& max) noexcept(BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
const T& max) noexcept(BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
if (last_f0 == 0)
{
@@ -132,25 +134,29 @@ void handle_zero_derivative(F f,
} // namespace
template <class F, class T, class Tol, class Policy>
std::pair<T, T> bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter, const Policy& pol) noexcept(policies::is_noexcept_error_policy<Policy>::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
BOOST_MATH_GPU_ENABLED boost::math::pair<T, T> bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter, const Policy& pol) noexcept(policies::is_noexcept_error_policy<Policy>::value && BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
T fmin = f(min);
T fmax = f(max);
if (fmin == 0)
{
max_iter = 2;
return std::make_pair(min, min);
return boost::math::make_pair(min, min);
}
if (fmax == 0)
{
max_iter = 2;
return std::make_pair(max, max);
return boost::math::make_pair(max, max);
}
//
// Error checking:
//
static const char* function = "boost::math::tools::bisect<%1%>";
constexpr auto function = "boost::math::tools::bisect<%1%>";
if (min >= max)
{
return boost::math::detail::pair_from_single(policies::raise_evaluation_error(function,
@@ -200,29 +206,41 @@ std::pair<T, T> bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter, con
std::cout << "Bisection required " << max_iter << " iterations.\n";
#endif
return std::make_pair(min, max);
return boost::math::make_pair(min, max);
}
template <class F, class T, class Tol>
inline std::pair<T, T> bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
BOOST_MATH_GPU_ENABLED inline boost::math::pair<T, T> bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value && BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
return bisect(f, min, max, tol, max_iter, policies::policy<>());
}
template <class F, class T, class Tol>
inline std::pair<T, T> bisect(F f, T min, T max, Tol tol) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
BOOST_MATH_GPU_ENABLED inline boost::math::pair<T, T> bisect(F f, T min, T max, Tol tol) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value && BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
std::uintmax_t m = (std::numeric_limits<std::uintmax_t>::max)();
boost::math::uintmax_t m = (boost::math::numeric_limits<boost::math::uintmax_t>::max)();
return bisect(f, min, max, tol, m, policies::policy<>());
}
template <class F, class T>
T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
BOOST_MATH_GPU_ENABLED T newton_raphson_iterate(F f, T guess, T min, T max, int digits, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value && BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::tools::newton_raphson_iterate<%1%>";
constexpr auto function = "boost::math::tools::newton_raphson_iterate<%1%>";
if (min > max)
{
return policies::raise_evaluation_error(function, "Range arguments in wrong order in boost::math::tools::newton_raphson_iterate(first arg=%1%)", min, boost::math::policies::policy<>());
@@ -249,7 +267,7 @@ T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t&
T max_range_f = 0;
T min_range_f = 0;
std::uintmax_t count(max_iter);
boost::math::uintmax_t count(max_iter);
#ifdef BOOST_MATH_INSTRUMENT
std::cout << "Newton_raphson_iterate, guess = " << guess << ", min = " << min << ", max = " << max
@@ -336,12 +354,22 @@ T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t&
}
template <class F, class T>
inline T newton_raphson_iterate(F f, T guess, T min, T max, int digits) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
BOOST_MATH_GPU_ENABLED inline T newton_raphson_iterate(F f, T guess, T min, T max, int digits) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value && BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
std::uintmax_t m = (std::numeric_limits<std::uintmax_t>::max)();
boost::math::uintmax_t m = (boost::math::numeric_limits<boost::math::uintmax_t>::max)();
return newton_raphson_iterate(f, guess, min, max, digits, m);
}
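A host-side usage sketch for the GPU-enabled newton_raphson_iterate above, assuming boost::math::pair and boost::math::uintmax_t alias their std counterparts on non-GPU builds; `sqrt2_functor` is an illustrative name:
#include <boost/math/tools/roots.hpp>
#include <cstdio>
struct sqrt2_functor
{
    // Returns (f(x), f'(x)) for f(x) = x^2 - 2
    boost::math::pair<double, double> operator()(double x) const
    {
        return boost::math::make_pair(x * x - 2.0, 2.0 * x);
    }
};
int main()
{
    boost::math::uintmax_t max_iter = 50;
    double r = boost::math::tools::newton_raphson_iterate(sqrt2_functor(), 1.5, 1.0, 2.0, 40, max_iter);
    std::printf("sqrt(2) ~= %.15f in %u iterations\n", r, static_cast<unsigned>(max_iter));
    return 0;
}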
// TODO(mborland): Disabled for now
// Recursion needs to be removed, but there is no demand at this time
#ifdef BOOST_MATH_HAS_NVRTC
}}} // Namespaces
#else
namespace detail {
struct halley_step

View File

@@ -10,10 +10,10 @@
#pragma once
#endif
#include <cmath>
#include <cstdint>
#include <limits>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/cstdint.hpp>
namespace boost{ namespace math{ namespace tools{
@@ -21,13 +21,17 @@ namespace boost{ namespace math{ namespace tools{
// Simple series summation come first:
//
template <class Functor, class U, class V>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms, const V& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms, const V& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
std::uintmax_t counter = max_terms;
boost::math::uintmax_t counter = max_terms;
result_type result = init_value;
result_type next_term;
@@ -44,14 +48,22 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor&
}
template <class Functor, class U>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
typename Functor::result_type init_value = 0;
return sum_series(func, factor, max_terms, init_value);
}
template <class Functor, class U>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, std::uintmax_t& max_terms, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
@@ -60,17 +72,25 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor&
}
template <class Functor>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
std::uintmax_t iters = (std::numeric_limits<std::uintmax_t>::max)();
boost::math::uintmax_t iters = (boost::math::numeric_limits<boost::math::uintmax_t>::max)();
result_type init_val = 0;
return sum_series(func, bits, iters, init_val);
}
template <class Functor>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
@@ -79,23 +99,31 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor&
}
template <class Functor, class U>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
std::uintmax_t iters = (std::numeric_limits<std::uintmax_t>::max)();
boost::math::uintmax_t iters = (boost::math::numeric_limits<boost::math::uintmax_t>::max)();
return sum_series(func, bits, iters, init_value);
}
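A host-side usage sketch for sum_series above: the functor exposes a result_type and returns successive terms from operator(); summation stops once a term drops below the tolerance implied by `bits`. `geometric_terms` is an illustrative name:
#include <boost/math/tools/series.hpp>
#include <cstdio>
struct geometric_terms
{
    typedef double result_type;
    double term = 1.0;
    // Each call yields the next term of 1 + 1/2 + 1/4 + ...
    double operator()() { double t = term; term *= 0.5; return t; }
};
int main()
{
    geometric_terms f;
    boost::math::uintmax_t max_terms = 100;
    double sum = boost::math::tools::sum_series(f, 53, max_terms); // converges to 2
    std::printf("%.15f\n", sum);
    return 0;
}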
//
// Checked summation:
//
template <class Functor, class U, class V>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms, const V& init_value, V& norm) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms, const V& init_value, V& norm) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
std::uintmax_t counter = max_terms;
boost::math::uintmax_t counter = max_terms;
result_type result = init_value;
result_type next_term;
@@ -125,7 +153,11 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(F
// in any case the result is still much better than a naive summation.
//
template <class Functor>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
@@ -148,13 +180,17 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Fun
}
template <class Functor>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
std::uintmax_t counter = max_terms;
boost::math::uintmax_t counter = max_terms;
result_type factor = ldexp(result_type(1), bits);
result_type result = func();

View File

@@ -17,12 +17,14 @@ run test_arcsine_pdf_float.cu ;
run test_arcsine_quan_double.cu ;
run test_arcsine_quan_float.cu ;
run test_arcsine_range_support_double.cu ;
run test_bernoulli_cdf_double.cu ;
run test_bernoulli_cdf_float.cu ;
run test_bernoulli_pdf_double.cu ;
run test_bernoulli_pdf_float.cu ;
run test_bernoulli_range_support_double.cu ;
run test_bernoulli_range_support_float.cu ;
run test_cauchy_cdf_double.cu ;
run test_cauchy_cdf_float.cu ;
run test_cauchy_pdf_double.cu ;
@@ -31,6 +33,7 @@ run test_cauchy_quan_double.cu ;
run test_cauchy_quan_float.cu ;
run test_cauchy_range_support_double.cu ;
run test_cauchy_range_support_float.cu ;
run test_exponential_cdf_double.cu ;
run test_exponential_cdf_float.cu ;
run test_exponential_pdf_double.cu ;
@@ -39,40 +42,47 @@ run test_exponential_quan_double.cu ;
run test_exponential_quan_float.cu ;
run test_exponential_range_support_double.cu ;
run test_exponential_range_support_float.cu ;
run test_extreme_value_cdf_double.cu ;
run test_extreme_value_cdf_float.cu ;
run test_extreme_value_pdf_double.cu ;
run test_extreme_value_pdf_float.cu ;
run test_extreme_value_quan_double.cu ;
run test_extreme_value_quan_float.cu ;
run test_holtsmark_cdf_double.cu ;
run test_holtsmark_cdf_float.cu ;
run test_holtsmark_pdf_double.cu ;
run test_holtsmark_pdf_float.cu ;
run test_landau_cdf_double.cu ;
run test_landau_cdf_float.cu ;
run test_landau_pdf_double.cu ;
run test_landau_pdf_float.cu ;
run test_landau_quan_double.cu ;
run test_landau_quan_float.cu ;
run test_laplace_cdf_double.cu ;
run test_laplace_cdf_float.cu ;
run test_laplace_pdf_double.cu ;
run test_laplace_pdf_float.cu ;
run test_laplace_quan_double.cu ;
run test_laplace_quan_float.cu ;
run test_logistic_cdf_double.cu ;
run test_logistic_cdf_float.cu ;
run test_logistic_pdf_double.cu ;
run test_logistic_pdf_float.cu ;
run test_logistic_quan_double.cu ;
run test_logistic_quan_float.cu ;
run test_mapairy_cdf_double.cu ;
run test_mapairy_cdf_float.cu ;
run test_mapairy_pdf_double.cu ;
run test_mapairy_pdf_float.cu ;
run test_mapairy_quan_double.cu ;
run test_mapairy_quan_float.cu ;
run test_saspoint5_cdf_double.cu ;
run test_saspoint5_cdf_float.cu ;
run test_saspoint5_pdf_double.cu ;
@@ -81,17 +91,52 @@ run test_saspoint5_quan_double.cu ;
run test_saspoint5_quan_float.cu ;
# Special Functions
# run test_beta_simple.cpp ;
run test_beta_double.cu ;
run test_beta_float.cu ;
run test_bessel_i0_double.cu ;
run test_bessel_i0_float.cu ;
run test_bessel_i1_double.cu ;
run test_bessel_i1_float.cu ;
run test_bessel_j0_double.cu ;
run test_bessel_j0_float.cu ;
run test_bessel_j1_double.cu ;
run test_bessel_j1_float.cu ;
run test_bessel_k0_double.cu ;
run test_bessel_k0_float.cu ;
run test_bessel_k1_double.cu ;
run test_bessel_k1_float.cu ;
run test_bessel_kn_double.cu ;
run test_bessel_kn_float.cu ;
run test_bessel_y0_double.cu ;
run test_bessel_y0_float.cu ;
run test_bessel_y1_double.cu ;
run test_bessel_y1_float.cu ;
run test_cyl_bessel_i_double.cu ;
run test_cyl_bessel_i_float.cu ;
run test_cyl_bessel_j_double.cu ;
run test_cyl_bessel_j_float.cu ;
run test_cyl_bessel_k_double.cu ;
run test_cyl_bessel_k_float.cu ;
run test_sph_bessel_double.cu ;
run test_sph_bessel_float.cu ;
run test_cyl_neumann_double.cu ;
run test_cyl_neumann_float.cu ;
run test_sph_neumann_double.cu ;
run test_sph_neumann_float.cu ;
run test_cbrt_double.cu ;
run test_cbrt_float.cu ;
run test_changesign_double.cu ;
run test_changesign_float.cu ;
run test_cos_pi_double.cu ;
run test_cos_pi_float.cu ;
run test_digamma_double.cu ;
run test_digamma_float.cu ;
run test_erf_double.cu ;
run test_erf_float.cu ;
run test_erf_inv_double.cu ;
@@ -100,21 +145,29 @@ run test_erfc_double.cu ;
run test_erfc_float.cu ;
run test_erfc_inv_double.cu ;
run test_erfc_inv_float.cu ;
run test_expm1_double.cu ;
run test_expm1_float.cu ;
run test_lgamma_double.cu ;
run test_lgamma_float.cu ;
run test_log1p_double.cu ;
run test_log1p_float.cu ;
run test_modf_double.cu ;
run test_modf_float.cu ;
run test_round_double.cu ;
run test_round_float.cu ;
run test_sin_pi_double.cu ;
run test_sin_pi_float.cu ;
run test_tgamma_double.cu ;
run test_tgamma_float.cu ;
run test_log1p_double.cu ;
run test_log1p_float.cu ;
run test_modf_double.cu ;
run test_modf_float.cu ;
run test_round_double.cu ;
run test_round_float.cu ;
run test_sin_pi_double.cu ;
run test_sin_pi_float.cu ;
run test_trigamma_double.cu ;
run test_trigamma_float.cu ;
run test_trunc_double.cu ;
run test_trunc_float.cu ;

View File

@@ -90,12 +90,47 @@ run test_saspoint5_quan_nvrtc_float.cpp ;
# Special Functions
run test_beta_nvrtc_double.cpp ;
run test_beta_nvrtc_float.cpp ;
run test_bessel_i0_nvrtc_double.cpp ;
run test_bessel_i0_nvrtc_float.cpp ;
run test_bessel_i1_nvrtc_double.cpp ;
run test_bessel_i1_nvrtc_float.cpp ;
run test_bessel_j0_nvrtc_double.cpp ;
run test_bessel_j0_nvrtc_float.cpp ;
run test_bessel_j1_nvrtc_double.cpp ;
run test_bessel_j1_nvrtc_float.cpp ;
run test_bessel_k0_nvrtc_double.cpp ;
run test_bessel_k0_nvrtc_float.cpp ;
run test_bessel_k1_nvrtc_double.cpp ;
run test_bessel_k1_nvrtc_float.cpp ;
run test_bessel_kn_nvrtc_double.cpp ;
run test_bessel_kn_nvrtc_float.cpp ;
run test_bessel_y0_nvrtc_double.cpp ;
run test_bessel_y0_nvrtc_float.cpp ;
run test_bessel_y1_nvrtc_double.cpp ;
run test_bessel_y1_nvrtc_float.cpp ;
run test_cyl_bessel_i_nvrtc_double.cpp ;
run test_cyl_bessel_i_nvrtc_float.cpp ;
run test_cyl_bessel_j_nvrtc_double.cpp ;
run test_cyl_bessel_j_nvrtc_float.cpp ;
run test_cyl_bessel_k_nvrtc_double.cpp ;
run test_cyl_bessel_k_nvrtc_float.cpp ;
run test_sph_bessel_nvrtc_double.cpp ;
run test_sph_bessel_nvrtc_float.cpp ;
run test_cyl_neumann_nvrtc_double.cpp ;
run test_cyl_neumann_nvrtc_float.cpp ;
run test_sph_neumann_nvrtc_double.cpp ;
run test_sph_neumann_nvrtc_float.cpp ;
run test_cbrt_nvrtc_double.cpp ;
run test_cbrt_nvrtc_float.cpp ;
run test_cos_pi_nvrtc_double.cpp ;
run test_cos_pi_nvrtc_float.cpp ;
run test_digamma_nvrtc_double.cpp ;
run test_digamma_nvrtc_float.cpp ;
run test_erf_nvrtc_double.cpp ;
run test_erf_nvrtc_float.cpp ;
run test_erfc_nvrtc_double.cpp ;
@@ -104,22 +139,32 @@ run test_erf_inv_nvrtc_double.cpp ;
run test_erf_inv_nvrtc_float.cpp ;
run test_erfc_inv_nvrtc_double.cpp ;
run test_erfc_inv_nvrtc_float.cpp ;
run test_expm1_nvrtc_double.cpp ;
run test_expm1_nvrtc_float.cpp ;
run test_fpclassify_nvrtc_double.cpp ;
run test_fpclassify_nvrtc_float.cpp ;
run test_gamma_nvrtc_double.cpp ;
run test_gamma_nvrtc_float.cpp ;
run test_log1p_nvrtc_double.cpp ;
run test_log1p_nvrtc_float.cpp ;
run test_modf_nvrtc_double.cpp ;
run test_modf_nvrtc_float.cpp ;
run test_round_nvrtc_double.cpp ;
run test_round_nvrtc_float.cpp ;
run test_sign_nvrtc_double.cpp ;
run test_sign_nvrtc_float.cpp ;
run test_sin_pi_nvrtc_double.cpp ;
run test_sin_pi_nvrtc_float.cpp ;
run test_trigamma_nvrtc_double.cpp ;
run test_trigamma_nvrtc_float.cpp ;
run test_trunc_nvrtc_double.cpp ;

View File

@@ -25,6 +25,10 @@ run test_saspoint5.cpp ;
# Special Functions
run pow_test.cpp ;
run test_beta_simple.cpp ;
run test_bessel_i.cpp ;
run test_bessel_j.cpp ;
run test_bessel_k.cpp ;
run test_bessel_y.cpp ;
run test_cbrt.cpp ;
run test_sign.cpp ;
run test_round.cpp ;

View File

@@ -3,7 +3,13 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#else
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <boost/math/tools/config.hpp>
#endif
#include "test_bessel_i.hpp"
//
@@ -82,7 +88,11 @@ void expected_results()
"linux", // platform
largest_type, // test type(s)
".*Random.*", // test data group
#ifdef SYCL_LANGUAGE_VERSION
".*", 600, 200);
#else
".*", 400, 200); // test function
#endif
add_expected_result(
"GNU.*", // compiler
@@ -111,7 +121,11 @@ void expected_results()
".*", // platform
largest_type, // test type(s)
".*", // test data group
#ifdef SYCL_LANGUAGE_VERSION
".*", 400, 200);
#else
".*", 20, 10); // test function
#endif
//
// Set error rates a little higher for real_concept -
// now that we use a series approximation for small z

View File

@@ -9,6 +9,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>
#include "functor.hpp"
@@ -180,7 +181,10 @@ void test_bessel(T, const char* name)
//
// Special cases for full coverage:
//
#ifndef BOOST_MATH_NO_EXCEPTIONS
BOOST_CHECK_THROW(boost::math::cyl_bessel_i(T(-2.5), T(-2.5)), std::domain_error);
#endif
BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(0), T(0)), T(1));
BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(10), T(0)), T(0));
BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(-10), T(0)), T(0));
@@ -197,10 +201,12 @@ void test_bessel(T, const char* name)
}
}
T tolerance = boost::math::tools::epsilon<T>() * 100;
#ifndef SYCL_LANGUAGE_VERSION
if ((boost::math::tools::digits<T>() <= std::numeric_limits<double>::digits) && (std::numeric_limits<T>::max_exponent > 1000))
{
BOOST_CHECK_CLOSE_FRACTION(boost::math::cyl_bessel_i(T(0.5), T(710)), SC_(3.3447452278080108123142599104927325061327359278058601201179e306), tolerance);
}
#endif
#if LDBL_MAX_EXP >= 11356
BOOST_IF_CONSTEXPR (std::numeric_limits<T>::max_exponent >= 11356)
{

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 1024;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_i0(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 1024;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_i0(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_i0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_i0.hpp>
extern "C" __global__
void test_bessel_i0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_i0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_i0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_i0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_i0.hpp>
extern "C" __global__
void test_bessel_i0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_i0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_i0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 1024;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_i1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 1024;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_i1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_i1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_i1.hpp>
extern "C" __global__
void test_bessel_i1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_i1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_i1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_i1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_i1.hpp>
extern "C" __global__
void test_bessel_i1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_i1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
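// The two option sets differ only in the include paths: the CI image layout versus a local checkout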
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
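// The second input buffer is unused by this kernel; presumably it is kept so all the NVRTC tests share one launch signature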
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_i1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -3,7 +3,12 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#else
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <boost/math/tools/config.hpp>
#endif
#include "test_bessel_j.hpp"

View File

@@ -9,6 +9,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>
@@ -279,7 +280,9 @@ void test_bessel(T, const char* name)
BOOST_MATH_CHECK_THROW(boost::math::sph_bessel(2, T(-2.0)), std::domain_error);
BOOST_CHECK_EQUAL(boost::math::cyl_bessel_j(T(0), T(2.5)), boost::math::cyl_bessel_j(T(0), T(-2.5)));
BOOST_CHECK_EQUAL(boost::math::cyl_bessel_j(T(1), T(2.5)), -boost::math::cyl_bessel_j(T(1), T(-2.5)));
#ifndef SYCL_LANGUAGE_VERSION
BOOST_CHECK_CLOSE_FRACTION(boost::math::cyl_bessel_j(364, T(38.5)), SC_(1.793940496519190500748409872348034004417458734118663909894e-309), tolerance);
#endif
//
// Special cases at infinity:
//

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
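// Time the kernel launch together with the synchronization below, since launches return asynchronously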
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
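// Time the equivalent serial computation on the host for the comparison printed below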
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_j0(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_j0(input_vector[i]));
double t = w.elapsed();
// check the results
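// Device and host run the same implementation, so a tight 10-ulp tolerance suffices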
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath> // std::isfinite
#include <cstdlib> // exit, EXIT_FAILURE
#include <boost/math/special_functions/detail/bessel_j0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_j0.hpp>
extern "C" __global__
void test_bessel_j0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_j0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
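// Fixed seed keeps any failing inputs reproducible across runs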
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_j0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath> // std::isfinite
#include <cstdlib> // exit, EXIT_FAILURE
#include <boost/math/special_functions/detail/bessel_j0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_j0.hpp>
extern "C" __global__
void test_bessel_j0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_j0_kernel");
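// The kernel is declared extern "C", so its symbol is unmangled and cuModuleGetFunction can look it up by name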
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_j0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
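// Synchronize before reading the timer; the launch itself returns immediately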
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_j1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
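// The same detail::bessel_j1 call on the host provides the reference values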
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_j1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath> // std::isfinite
#include <cstdlib> // exit, EXIT_FAILURE
#include <boost/math/special_functions/detail/bessel_j1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_j1.hpp>
extern "C" __global__
void test_bessel_j1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_j1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
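// The driver API takes the addresses of the kernel arguments, in the kernel's parameter order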
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_j1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath> // std::isfinite
#include <cstdlib> // exit, EXIT_FAILURE
#include <boost/math/special_functions/detail/bessel_j1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_j1.hpp>
extern "C" __global__
void test_bessel_j1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_j1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
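// NVRTC emits PTX; the driver JIT-compiles it for the attached device when the module is loaded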
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_j1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -5,7 +5,12 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#else
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <boost/math/tools/config.hpp>
#endif
#ifdef _MSC_VER
# pragma warning(disable : 4756) // overflow in constant arithmetic

View File

@@ -9,6 +9,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>
#include "functor.hpp"
@@ -175,6 +176,7 @@ void test_bessel(T, const char* name)
//
// Extra test coverage:
//
#ifndef SYCL_LANGUAGE_VERSION // SYCL doesn't throw
BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(2), T(-1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(2.2), T(-1)), std::domain_error);
BOOST_IF_CONSTEXPR(std::numeric_limits<T>::has_infinity)
@@ -194,6 +196,7 @@ void test_bessel(T, const char* name)
BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(-1.25), T(0)), std::domain_error);
BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(-1), T(0)), std::domain_error);
BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(1), T(0)), std::domain_error);
#endif
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
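// Inputs lie in [0, 1], exercising the small-argument region where K0 grows like -log(x)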
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_k0(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
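// Reserve up front so no reallocation happens inside the timed loop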
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_k0(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_k0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_k0.hpp>
extern "C" __global__
void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_k0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
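// Arguments up to 1000 push K0 toward underflow; non-finite host results are skipped in the check below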
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_k0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
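// ignore_error avoids throwing on overflow in device code; disabling double promotion prevents internal use of long double, which CUDA lacks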
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_k0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_k0.hpp>
extern "C" __global__
void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_k0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_k0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
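// Map this thread onto a single element via the usual global index.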
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_k1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
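// epsilon_difference reports the error in units of machine epsilon; allow up to 10.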
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
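// rand()/RAND_MAX yields arguments in [0, 1].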
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_k1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_k1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_k1.hpp>
extern "C" __global__
void test_bessel_k1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_k1_kernel");
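// The include paths below are machine-specific; adjust them to your Boost checkout and CUDA installation.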
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
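// The second input buffer is allocated and copied but not read by this kernel.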
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_k1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_k1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_k1.hpp>
extern "C" __global__
void test_bessel_k1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_k1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_k1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,105 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
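// bessel_kn requires an explicit policy argument; the default policy is used here.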
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_kn(2, in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
boost::math::policies::policy<> pol;
w.reset();
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_kn(2, input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,105 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_kn(2, in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
boost::math::policies::policy<> pol;
w.reset();
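// Time the equivalent serial computation on the host for comparison.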
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_kn(2, input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,192 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_kn.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_kn.hpp>
extern "C" __global__
void test_bessel_kn_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_kn(2, in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_kn_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_kn_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_kn_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
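// The blocking copy below also synchronizes with the kernel on the default stream.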
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_kn(2, h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,192 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_kn.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_kn.hpp>
extern "C" __global__
void test_bessel_kn_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_kn(2, in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_kn_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_kn_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_kn_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
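// Arguments up to 1000: K_n(x) decays exponentially, so large x underflows toward zero.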
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_kn(2, h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -3,7 +3,12 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#else
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <boost/math/tools/config.hpp>
#endif
#include "test_bessel_y.hpp"
@@ -234,7 +239,11 @@ void expected_results()
".*", // platform
largest_type, // test type(s)
".*(Y[nv]|y).*Random.*", // test data group
#ifdef SYCL_LANGUAGE_VERSION
".*", 2000, 1000);
#else
".*", 1500, 1000); // test function
#endif
//
// Fallback for sun has to go after the general cases above:
//

View File

@@ -9,6 +9,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>
@@ -241,10 +242,12 @@ void test_bessel(T, const char* name)
BOOST_CHECK_EQUAL(boost::math::sph_neumann(2, std::numeric_limits<T>::infinity()), T(0));
}
#ifndef BOOST_MATH_NO_EXCEPTIONS
BOOST_CHECK_THROW(boost::math::cyl_neumann(T(0), T(-1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::cyl_neumann(T(0.2), T(-1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::cyl_neumann(T(2), T(0)), std::domain_error);
BOOST_CHECK_THROW(boost::math::sph_neumann(2, T(-2)), std::domain_error);
#endif
#if LDBL_MAX_EXP > 1024
if (std::numeric_limits<T>::max_exponent > 1024)
{

View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y0(in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
boost::math::policies::policy<> pol;
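// Serial baseline: the same bessel_y0 call, computed on the CPU.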
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_y0(input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y0(in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
boost::math::policies::policy<> pol;
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_y0(input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
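// Host and device run the same double-precision code path; compare within 10 machine epsilons.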
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,194 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y0.hpp>
extern "C" __global__
void test_bessel_y0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y0(in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_y0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_y0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_y0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
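// A fixed seed keeps the inputs reproducible across runs.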
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_y0(h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,194 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y0.hpp>
extern "C" __global__
void test_bessel_y0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y0(in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_y0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_y0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_y0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
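// The host reference uses the same default policy as the device kernel.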
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_y0(h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y1(in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
boost::math::policies::policy<> pol;
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_y1(input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y1(in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
boost::math::policies::policy<> pol;
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_y1(input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
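The watch type from stopwatch.hpp is likewise not part of this diff; judging by its use here (elapsed() in seconds, reset()), it is assumed to be a thin std::chrono timer, roughly:

#include <chrono>
class watch_sketch
{
std::chrono::steady_clock::time_point start_ = std::chrono::steady_clock::now();
public:
// Seconds since construction or the last reset()
double elapsed() const { return std::chrono::duration<double>(std::chrono::steady_clock::now() - start_).count(); }
void reset() { start_ = std::chrono::steady_clock::now(); }
};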

View File

@@ -0,0 +1,194 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y1.hpp>
extern "C" __global__
void test_bessel_y1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y1(in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_y1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_y1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_y1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_y1(h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
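A note on the tolerances: boost::math::epsilon_difference(a, b) reports the distance between a and b in multiples of machine epsilon, so the "> 300" check above allows roughly 300 epsilon of relative error between GPU and host results (the CUDA runtime tests use the tighter "> 10"). A hand-rolled approximation, for intuition only and assuming both values are finite and non-zero:

#include <algorithm>
#include <cmath>
#include <limits>
template <typename T>
T approx_epsilon_difference(T a, T b)
{
// Relative difference rescaled into units of machine epsilon
const T rel = std::fabs(a - b) / std::max(std::fabs(a), std::fabs(b));
return rel / std::numeric_limits<T>::epsilon();
}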

View File

@@ -0,0 +1,194 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y1.hpp>
extern "C" __global__
void test_bessel_y1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y1(in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_y1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_y1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_y1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_y1(h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
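Because the kernel is declared extern "C", its symbol is not C++-mangled, so the literal string passed to cuModuleGetFunction resolves directly and the nvrtcAddNameExpression call is strictly optional in these tests. For a mangled (e.g. templated) C++ kernel the lowered name would have to be recovered instead; a sketch with a hypothetical kernel name, reusing the prog/module/kernel variables from the test above and omitting error handling:

// Register the name expression *before* nvrtcCompileProgram
nvrtcAddNameExpression(prog, "bessel_kernel<float>");
// ... nvrtcCompileProgram, nvrtcGetPTX, cuModuleLoadDataEx as above ...
const char* lowered = nullptr;
nvrtcGetLoweredName(prog, "bessel_kernel<float>", &lowered); // valid until nvrtcDestroyProgram
cuModuleGetFunction(&kernel, module, lowered);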

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_i(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
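For reference, cyl_bessel_i(v, x) computes the modified Bessel function of the first kind, defined by the everywhere-convergent ascending series

I_v(x) = \sum_{k=0}^{\infty} \frac{(x/2)^{v + 2k}}{k!\, \Gamma(v + k + 1)}

and the random test arguments drawn from [0, 1) keep both order and argument well inside this series' fast-converging regime.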

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_i(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_i_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_i_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_i_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_i_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_i(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
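The two macros at the top of these NVRTC tests (overflow errors ignored, no internal promotion of double) could equivalently be expressed as an explicit policy passed at each call site, avoiding macro-ordering concerns; a sketch using the standard Boost.Math policy framework, with test_policy a name chosen here for illustration:

#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/bessel.hpp>
namespace pol = boost::math::policies;
// Same effect as BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error and
// BOOST_MATH_PROMOTE_DOUBLE_POLICY false, but scoped to the call
using test_policy = pol::policy<pol::overflow_error<pol::ignore_error>, pol::promote_double<false>>;
double v = boost::math::cyl_bessel_i(1.0, 2.0, test_policy());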

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_i_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_i_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_i_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_i_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_i(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_j(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
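The block sizes in these launchers are hard-coded (1024 in the bessel_y1 tests, 256 everywhere else) and the grid size is the usual ceiling division. If tuning mattered, the runtime could suggest a block size instead; a sketch using the occupancy API, reusing the kernel and buffers from the test above:

int blockSize = 0, minGridSize = 0;
// Ask the runtime for a block size that maximizes occupancy for this kernel
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, cuda_test, 0, 0);
int gridSize = (numElements + blockSize - 1) / blockSize; // ceiling division over all elements
cuda_test<<<gridSize, blockSize>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);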

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_j(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
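cyl_bessel_j(v, x) is the ordinary Bessel function of the first kind; its defining series differs from that of I_v only in the alternating sign of the terms:

J_v(x) = \sum_{k=0}^{\infty} \frac{(-1)^k (x/2)^{v + 2k}}{k!\, \Gamma(v + k + 1)}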

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_j_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_j(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_j_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_j(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_k(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
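cyl_bessel_k(v, x) is the modified Bessel function of the second kind; for non-integer order it is defined in terms of I_{\pm v}, with integer orders taken as the limit:

K_v(x) = \frac{\pi}{2} \cdot \frac{I_{-v}(x) - I_v(x)}{\sin(v\pi)}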

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_k(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_k_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_k_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_k_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_k_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_k(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

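A note on the shape shared by all of the NVRTC tests in this commit: each one compiles the kernel source at runtime (nvrtcCreateProgram / nvrtcCompileProgram), extracts PTX, loads it through the driver API, and launches with cuLaunchKernel. Below is a minimal sketch of the compile-to-PTX step factored into a helper; the name compile_to_ptx and the exception-based error handling are illustrative only, not part of this commit:

#include <nvrtc.h>
#include <stdexcept>
#include <string>
#include <vector>

// Compiles CUDA source to PTX, throwing with the build log on failure.
std::string compile_to_ptx(const char* source, const char* name,
                           const std::vector<const char*>& opts)
{
    nvrtcProgram prog;
    if (nvrtcCreateProgram(&prog, source, name, 0, nullptr, nullptr) != NVRTC_SUCCESS)
        throw std::runtime_error("nvrtcCreateProgram failed");
    if (nvrtcCompileProgram(prog, static_cast<int>(opts.size()), opts.data()) != NVRTC_SUCCESS)
    {
        std::size_t log_size;
        nvrtcGetProgramLogSize(prog, &log_size);
        std::string log(log_size, '\0');
        nvrtcGetProgramLog(prog, &log[0]);
        nvrtcDestroyProgram(&prog);
        throw std::runtime_error("NVRTC compilation failed:\n" + log);
    }
    std::size_t ptx_size;
    nvrtcGetPTXSize(prog, &ptx_size);
    std::string ptx(ptx_size, '\0');
    nvrtcGetPTX(prog, &ptx[0]);
    nvrtcDestroyProgram(&prog);
    return ptx;
}
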
View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_k_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_k_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_k_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_k_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_k(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

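The two macros defined at the top of each NVRTC test are load-bearing: device code cannot throw, so BOOST_MATH_OVERFLOW_ERROR_POLICY is set to ignore_error, and BOOST_MATH_PROMOTE_DOUBLE_POLICY is set to false so that float tests exercise genuine float arithmetic instead of silently promoting to double. A host-side sketch of the observable effect (the argument choice is illustrative):

#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <boost/math/special_functions/bessel.hpp>
#include <iostream>

int main()
{
    // K_2(x) grows like 2/x^2 as x -> 0, overflowing float long before x = 1e-30f.
    // Under ignore_error the overflow yields infinity rather than an exception.
    std::cout << boost::math::cyl_bessel_k(2.0f, 1e-30f) << '\n';
}
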
View File

@@ -0,0 +1,116 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_neumann(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000)
{
std::cout << "error at line: " << i
<< "\nParallel: " << results[i]
<< "\n Serial: " << output_vector[i]
<< "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

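On the verification step used throughout these tests: boost::math::epsilon_difference reports the relative difference between the serial and parallel results in units of machine epsilon, so a "> 5000" check tolerates disagreement of up to roughly 5000 ULPs. A rough standalone equivalent for double precision (within_ulps is a hypothetical helper, not a Boost API):

#include <algorithm>
#include <cmath>
#include <limits>

// Relative error between two values, expressed in multiples of machine epsilon.
bool within_ulps(double serial, double parallel, double max_eps)
{
    const double diff  = std::fabs(serial - parallel);
    const double denom = (std::min)(std::fabs(serial), std::fabs(parallel));
    return diff / denom / std::numeric_limits<double>::epsilon() <= max_eps;
}

int main()
{
    // 1e-13 relative error is ~450 epsilon in double: comfortably inside 5000.
    return within_ulps(1.0, 1.0 + 1e-13, 5000) ? 0 : 1;
}
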
View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_neumann(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
bool failed = false;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(output_vector[i]) && std::isfinite(results[i])
&& boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
failed = true;
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

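The cuda_managed_ptr<T> and stopwatch helpers included by the plain CUDA tests live in headers that are not part of this diff. A minimal sketch of what such a unified-memory wrapper could look like (managed_array is a hypothetical stand-in; the real helper may differ, e.g. in error handling):

#include <cuda_runtime.h>
#include <cstddef>

template <typename T>
class managed_array
{
    T* p_ {nullptr};
public:
    // cudaMallocManaged memory is reachable from both host and device code.
    explicit managed_array(std::size_t n) { cudaMallocManaged(&p_, n * sizeof(T)); }
    ~managed_array() { cudaFree(p_); }
    managed_array(const managed_array&) = delete;
    managed_array& operator=(const managed_array&) = delete;
    T* get() const { return p_; }
    T& operator[](std::size_t i) const { return p_[i]; }
};
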
View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_neumann_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_neumann_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_neumann_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_neumann(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

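Every launch in these tests sizes its grid with the same integer ceiling division, (numElements + blockSize - 1) / blockSize, and relies on the i < numElements guard inside the kernel to discard the overshoot: 5000 elements in 256-thread blocks gives 20 blocks, i.e. 5120 threads, of which 120 fail the bounds check. As a compile-time sketch:

// Ceiling division as used for grid sizing above.
constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }
static_assert(ceil_div(5000, 256) == 20, "20 blocks of 256 threads cover 5000 elements");

int main() {}
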
View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_neumann_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_neumann_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_neumann_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_neumann(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

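A recurring rough edge in the NVRTC tests: the host buffers and the PTX image are managed with raw new/delete, so the early exit() paths (compilation failure, CUDA errors) leak them. A sketch of the same host allocations with automatic cleanup; the rest of the test would stay as written:

#include <memory>

int main()
{
    const int numElements = 5000;
    std::unique_ptr<double[]> h_in1(new double[numElements]);
    std::unique_ptr<double[]> h_in2(new double[numElements]);
    std::unique_ptr<double[]> h_out(new double[numElements]);
    // ... fill inputs, launch, copy back, and verify as above; no delete[]
    // calls are needed and early returns no longer leak the host side.
    return 0;
}
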
View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const unsigned *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_bessel(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<unsigned> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
std::mt19937_64 rng {42};
std::uniform_int_distribution<unsigned> order(1, 100);
std::uniform_real_distribution<float_type> val(0, 100);
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = order(rng);
input_vector2[i] = val(rng);
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::sph_bessel(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
bool failed = false;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 3000)
{
std::cout << "error at line: " << i
<< "\nParallel: " << results[i]
<< "\n Serial: " << output_vector[i]
<< "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

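Unlike the cylindrical functions, sph_bessel takes an unsigned integer order as its first argument, which is why these kernels carry a separate unsigned input array. A quick host-side sanity check; the expected value follows from j0(x) = sin(x)/x:

#include <boost/math/special_functions/bessel.hpp>
#include <iostream>

int main()
{
    // Spherical Bessel j0(1) = sin(1)/1 ~ 0.841471
    std::cout << boost::math::sph_bessel(0u, 1.0) << '\n';
}
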
View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const unsigned *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_bessel(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<unsigned> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
std::mt19937_64 rng {42};
std::uniform_int_distribution<unsigned> order(1, 100);
std::uniform_real_distribution<float_type> val(0, 100);
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = order(rng);
input_vector2[i] = val(rng);
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::sph_bessel(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
bool failed = false;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 150)
{
std::cout << "error at line: " << i
<< "\nParallel: " << results[i]
<< "\n Serial: " << output_vector[i]
<< "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

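The sph_bessel and NVRTC tests seed std::mt19937_64 with a fixed constant so that any reported mismatch is reproducible from run to run, whereas the cyl_neumann CUDA tests above still use the older rand()-based pattern. The seeded idiom in isolation:

#include <iostream>
#include <random>

int main()
{
    std::mt19937_64 rng(42); // fixed seed: identical input sequence every run
    std::uniform_int_distribution<unsigned> order(1, 100);
    std::uniform_real_distribution<double> val(0.0, 100.0);
    std::cout << order(rng) << ' ' << val(rng) << '\n';
}
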
View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_j_kernel(const unsigned *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_bessel(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
int numElements = 5000;
unsigned *h_in1, *d_in1;
float_type *h_in2, *h_out;
float_type *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new unsigned[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_int_distribution<unsigned> order(1, 100);
std::uniform_real_distribution<float_type> val(0.0f, 100.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = order(rng);
h_in2[i] = val(rng);
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(unsigned)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(unsigned), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_bessel(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 3000)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
failed = true;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (failed)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_j_kernel(const unsigned *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_bessel(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
int numElements = 5000;
unsigned *h_in1, *d_in1;
float_type *h_in2, *h_out;
float_type *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new unsigned[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_int_distribution<unsigned> order(1, 100);
std::uniform_real_distribution<float_type> val(0.0f, 100.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = order(rng);
h_in2[i] = val(rng);
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(unsigned)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(unsigned), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_bessel(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 3000)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
failed = true;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (failed)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

Some files were not shown because too many files have changed in this diff.