
Add GPU support to expint

Add SYCL testing of expint

Add markers to forward decls

Add CUDA testing of expint

Fix static variable usage under NVRTC

Add NVRTC testing

Add configurable definition of complex

Add function aliases

Add GPU support to gegenbauer polynomials

Add SYCL testing of gegenbauer

Add NVCC testing of gegenbauer

Add NVRTC testing of gegenbauer

Add GPU support for hankel

Add SYCL testing of hankel

Add NVCC testing of cyl_hankel_1

Add comprehensive NVCC testing

Add NVRTC testing of cyl and sph hankel

Update docs

Fix writing cuda::std::complex<T> to stdout

Add GPU support to hermite

Add SYCL testing of hermite

Add CUDA testing of hermite

Add NVRTC testing of hermite

Add markers to hermite docs
Matt Borland
2024-09-17 15:23:54 -04:00
parent c3afa49c9a
commit bbb8eee1d6
49 changed files with 4852 additions and 246 deletions
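To orient the diffs that follow: the BOOST_MATH_GPU_ENABLED marker added throughout expands, roughly, to `__host__ __device__` under NVCC (see the config.hpp hunk later in this commit) and to nothing in host-only builds. A minimal sketch of the usage this enables, assuming a CUDA translation unit; the kernel and its shape are illustrative only, not part of the commit:

    #include <boost/math/special_functions/expint.hpp>

    // Once expint carries BOOST_MATH_GPU_ENABLED, the same overload is
    // callable from device code as well as from the host.
    __global__ void expint_kernel(const double* in, double* out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
        {
            out[i] = boost::math::expint(1U, in[i]); // E_1(z)
        }
    }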

View File

@@ -18,10 +18,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
namespace boost { namespace math {
template <class T>
``__sf_result`` airy_ai(T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai(T x);
template <class T, class Policy>
``__sf_result`` airy_ai(T x, const Policy&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai(T x, const Policy&);
}} // namespaces
@@ -78,10 +78,10 @@ This function is implemented in terms of the Bessel functions using the relation
namespace boost { namespace math {
template <class T>
``__sf_result`` airy_bi(T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi(T x);
template <class T, class Policy>
``__sf_result`` airy_bi(T x, const Policy&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi(T x, const Policy&);
}} // namespaces
@@ -132,10 +132,10 @@ This function is implemented in terms of the Bessel functions using the relation
namespace boost { namespace math {
template <class T>
``__sf_result`` airy_ai_prime(T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai_prime(T x);
template <class T, class Policy>
``__sf_result`` airy_ai_prime(T x, const Policy&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai_prime(T x, const Policy&);
}} // namespaces
@@ -186,10 +186,10 @@ This function is implemented in terms of the Bessel functions using the relation
namespace boost { namespace math {
template <class T>
``__sf_result`` airy_bi_prime(T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi_prime(T x);
template <class T, class Policy>
``__sf_result`` airy_bi_prime(T x, const Policy&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi_prime(T x, const Policy&);
}} // namespaces
@@ -242,23 +242,23 @@ by providing an output iterator.
The signatures of the single-value functions are:
template <class T>
T airy_ai_zero(
BOOST_MATH_GPU_ENABLED T airy_ai_zero(
int m); // 1-based index of zero.
template <class T>
T airy_bi_zero(
BOOST_MATH_GPU_ENABLED T airy_bi_zero(
int m); // 1-based index of zero.
and for multiple zeros:
template <class T, class OutputIterator>
OutputIterator airy_ai_zero(
BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(
int start_index, // 1-based index of first zero.
unsigned number_of_zeros, // How many zeros to generate.
OutputIterator out_it); // Destination for zeros.
template <class T, class OutputIterator>
OutputIterator airy_bi_zero(
BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(
int start_index, // 1-based index of zero.
unsigned number_of_zeros, // How many zeros to generate
OutputIterator out_it); // Destination for zeros.
@@ -266,25 +266,25 @@ and for multiple zeros:
There are also versions which allow control of the __policy_section for error handling and precision.
template <class T>
T airy_ai_zero(
BOOST_MATH_GPU_ENABLED T airy_ai_zero(
int m, // 1-based index of zero.
const Policy&); // Policy to use.
template <class T>
T airy_bi_zero(
BOOST_MATH_GPU_ENABLED T airy_bi_zero(
int m, // 1-based index of zero.
const Policy&); // Policy to use.
template <class T, class OutputIterator>
OutputIterator airy_ai_zero(
BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(
int start_index, // 1-based index of first zero.
unsigned number_of_zeros, // How many zeros to generate.
OutputIterator out_it, // Destination for zeros.
const Policy& pol); // Policy to use.
template <class T, class OutputIterator>
OutputIterator airy_bi_zero(
BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(
int start_index, // 1-based index of zero.
unsigned number_of_zeros, // How many zeros to generate.
OutputIterator out_it, // Destination for zeros.
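For illustration, the multiple-zeros overloads above fill a caller-supplied output iterator; a minimal host-side sketch (start index and count are arbitrary):

    #include <boost/math/special_functions/airy.hpp>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main()
    {
        std::vector<double> zeros;
        // First three zeros of Ai, using the 1-based start index:
        boost::math::airy_ai_zero<double>(1, 3, std::back_inserter(zeros));
        for (double z : zeros)
            std::cout << z << '\n'; // all zeros of Ai are negative
    }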

View File

@@ -11,10 +11,10 @@
namespace boost{ namespace math{
template <class T>
``__sf_result`` expint(unsigned n, T z);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z);
template <class T, class ``__Policy``>
``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
}} // namespaces
@@ -26,10 +26,10 @@ the return type is `double` if T is an integer type, and T otherwise.
[h4 Description]
template <class T>
``__sf_result`` expint(unsigned n, T z);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z);
template <class T, class ``__Policy``>
``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
Returns the [@http://mathworld.wolfram.com/En-Function.html exponential integral En]
of z:
@@ -100,10 +100,10 @@ is used.
namespace boost{ namespace math{
template <class T>
``__sf_result`` expint(T z);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z);
template <class T, class ``__Policy``>
``__sf_result`` expint(T z, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z, const ``__Policy``&);
}} // namespaces
@@ -115,10 +115,10 @@ the return type is `double` if T is an integer type, and T otherwise.
[h4 Description]
template <class T>
``__sf_result`` expint(T z);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z);
template <class T, class ``__Policy``>
``__sf_result`` expint(T z, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z, const ``__Policy``&);
Returns the [@http://mathworld.wolfram.com/ExponentialIntegral.html exponential integral]
of z:
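A quick host-side sketch distinguishing the two forms documented in this section (printed values are approximate):

    #include <boost/math/special_functions/expint.hpp>
    #include <iostream>

    int main()
    {
        std::cout << boost::math::expint(1U, 1.0) << '\n'; // E_1(1) ~= 0.219384
        std::cout << boost::math::expint(1.0)     << '\n'; // Ei(1)  ~= 1.89512
    }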

View File

@@ -16,13 +16,13 @@
namespace boost{ namespace math{
template<typename Real>
Real gegenbauer(unsigned n, Real lambda, Real x);
BOOST_MATH_GPU_ENABLED Real gegenbauer(unsigned n, Real lambda, Real x);
template<typename Real>
Real gegenbauer_prime(unsigned n, Real lambda, Real x);
BOOST_MATH_GPU_ENABLED Real gegenbauer_prime(unsigned n, Real lambda, Real x);
template<typename Real>
Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k);
BOOST_MATH_GPU_ENABLED Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k);
}} // namespaces
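As a sanity check on the signatures above: with lambda = 1 the Gegenbauer polynomials reduce to the Chebyshev polynomials of the second kind, so values are easy to verify by hand. A short host-side sketch:

    #include <boost/math/special_functions/gegenbauer.hpp>
    #include <iostream>

    int main()
    {
        // C_3^1(x) = U_3(x) = 8x^3 - 4x; at x = 0.5 this is -1.
        std::cout << boost::math::gegenbauer(3, 1.0, 0.5) << '\n';
    }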

View File

@@ -3,18 +3,36 @@
[h4 Synopsis]
template <class T1, class T2>
std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__)
template <class T1, class T2>
std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
#else // When using cuda we use namespace cuda::std:: instead of std::
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
#endif
[h4 Description]
@@ -77,18 +95,35 @@ routines for integer order are used.
[h4 Synopsis]
template <class T1, class T2>
std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&);
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__)
template <class T1, class T2>
std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&);
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&);
#else // When using cuda we use namespace cuda::std:: instead of std::
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&);
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&);
#endif
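The practical effect of the split synopsis above: the return type is spelled differently per backend, but generic callers need not care. A host-side sketch; under NVCC the same call yields cuda::std::complex per the CUDA branch:

    #include <boost/math/special_functions/hankel.hpp>
    #include <iostream>

    int main()
    {
        // H^(1)_v(x) = J_v(x) + i Y_v(x)
        auto h = boost::math::cyl_hankel_1(0.0, 1.0);
        std::cout << h.real() << " + " << h.imag() << "i\n";
    }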
[h4 Description]

View File

@@ -9,13 +9,13 @@
namespace boost{ namespace math{
template <class T>
``__sf_result`` hermite(unsigned n, T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x);
template <class T, class ``__Policy``>
``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
template <class T1, class T2, class T3>
``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
}} // namespaces
@@ -26,10 +26,10 @@ note that when there is a single template argument the result is the same type
as that argument or `double` if the template argument is an integer type.
template <class T>
``__sf_result`` hermite(unsigned n, T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x);
template <class T, class ``__Policy``>
``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
Returns the value of the Hermite Polynomial of order /n/ at point /x/:
@@ -43,7 +43,7 @@ Hermite Polynomials:
[graph hermite]
template <class T1, class T2, class T3>
``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
Implements the three-term recurrence relation for the Hermite
polynomials; this function can be used to create a sequence of
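A worked sketch of that recurrence, H_{n+1}(x) = 2x H_n(x) - 2n H_{n-1}(x), using the signatures above:

    #include <boost/math/special_functions/hermite.hpp>
    #include <iostream>

    int main()
    {
        double x = 0.5;
        double h0 = boost::math::hermite(0, x); // H_0 = 1
        double h1 = boost::math::hermite(1, x); // H_1 = 2x
        // hermite_next(n, x, H_n, H_{n-1}) returns H_{n+1}:
        double h2 = boost::math::hermite_next(1, x, h1, h0);
        std::cout << h2 << '\n'; // H_2(x) = 4x^2 - 2 = -1 at x = 0.5
    }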

View File

@@ -47,50 +47,6 @@
namespace boost{ namespace math{
// Since we cannot pull this in from math fwd we need a copy
#ifdef BOOST_MATH_HAS_NVRTC
namespace detail{
typedef boost::math::integral_constant<int, 0> bessel_no_int_tag; // No integer optimisation possible.
typedef boost::math::integral_constant<int, 1> bessel_maybe_int_tag; // Maybe integer optimisation.
typedef boost::math::integral_constant<int, 2> bessel_int_tag; // Definite integer optimisation.
template <class T1, class T2, class Policy>
struct bessel_traits
{
using result_type = typename boost::math::conditional<
boost::math::is_integral<T1>::value,
typename tools::promote_args<T2>::type,
tools::promote_args_t<T1, T2>
>::type;
typedef typename policies::precision<result_type, Policy>::type precision_type;
using optimisation_tag = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 64),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
using optimisation_tag128 = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 113),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
};
} // detail
#endif
namespace detail{
template <class T, class Policy>

View File

@@ -1,4 +1,5 @@
// Copyright John Maddock 2007.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -12,6 +13,10 @@
#pragma warning(disable:4702) // Unreachable code (release mode only warning)
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/promotion.hpp>
#include <boost/math/tools/fraction.hpp>
@@ -20,7 +25,6 @@
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/digamma.hpp>
#include <boost/math/special_functions/log1p.hpp>
#include <boost/math/special_functions/pow.hpp>
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -35,13 +39,13 @@
namespace boost{ namespace math{
template <class T, class Policy>
inline typename tools::promote_args<T>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
expint(unsigned n, T z, const Policy& /*pol*/);
namespace detail{
template <class T>
inline T expint_1_rational(const T& z, const std::integral_constant<int, 0>&)
BOOST_MATH_GPU_ENABLED inline T expint_1_rational(const T& z, const boost::math::integral_constant<int, 0>&)
{
// this function is never actually called
BOOST_MATH_ASSERT(0);
@@ -49,7 +53,7 @@ inline T expint_1_rational(const T& z, const std::integral_constant<int, 0>&)
}
template <class T>
T expint_1_rational(const T& z, const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant<int, 53>&)
{
BOOST_MATH_STD_USING
T result;
@@ -123,7 +127,7 @@ T expint_1_rational(const T& z, const std::integral_constant<int, 53>&)
}
template <class T>
T expint_1_rational(const T& z, const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant<int, 64>&)
{
BOOST_MATH_STD_USING
T result;
@@ -204,7 +208,7 @@ T expint_1_rational(const T& z, const std::integral_constant<int, 64>&)
}
template <class T>
T expint_1_rational(const T& z, const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant<int, 113>&)
{
BOOST_MATH_STD_USING
T result;
@@ -351,14 +355,15 @@ T expint_1_rational(const T& z, const std::integral_constant<int, 113>&)
return result;
}
template <class T>
struct expint_fraction
{
typedef std::pair<T,T> result_type;
expint_fraction(unsigned n_, T z_) : b(n_ + z_), i(-1), n(n_){}
std::pair<T,T> operator()()
typedef boost::math::pair<T,T> result_type;
BOOST_MATH_GPU_ENABLED expint_fraction(unsigned n_, T z_) : b(n_ + z_), i(-1), n(n_){}
BOOST_MATH_GPU_ENABLED boost::math::pair<T,T> operator()()
{
std::pair<T,T> result = std::make_pair(-static_cast<T>((i+1) * (n+i)), b);
boost::math::pair<T,T> result = boost::math::make_pair(-static_cast<T>((i+1) * (n+i)), b);
b += 2;
++i;
return result;
@@ -370,11 +375,11 @@ private:
};
template <class T, class Policy>
inline T expint_as_fraction(unsigned n, T z, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T expint_as_fraction(unsigned n, T z, const Policy& pol)
{
BOOST_MATH_STD_USING
BOOST_MATH_INSTRUMENT_VARIABLE(z)
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
expint_fraction<T> f(n, z);
T result = tools::continued_fraction_b(
f,
@@ -392,9 +397,9 @@ template <class T>
struct expint_series
{
typedef T result_type;
expint_series(unsigned k_, T z_, T x_k_, T denom_, T fact_)
BOOST_MATH_GPU_ENABLED expint_series(unsigned k_, T z_, T x_k_, T denom_, T fact_)
: k(k_), z(z_), x_k(x_k_), denom(denom_), fact(fact_){}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
x_k *= -z;
denom += 1;
@@ -410,10 +415,10 @@ private:
};
template <class T, class Policy>
inline T expint_as_series(unsigned n, T z, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T expint_as_series(unsigned n, T z, const Policy& pol)
{
BOOST_MATH_STD_USING
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
BOOST_MATH_INSTRUMENT_VARIABLE(z)
@@ -443,10 +448,10 @@ inline T expint_as_series(unsigned n, T z, const Policy& pol)
}
template <class T, class Policy, class Tag>
T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag)
BOOST_MATH_GPU_ENABLED T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::expint<%1%>(unsigned, %1%)";
constexpr auto function = "boost::math::expint<%1%>(unsigned, %1%)";
if(z < 0)
return policies::raise_domain_error<T>(function, "Function requires z >= 0 but got %1%.", z, pol);
if(z == 0)
@@ -468,15 +473,21 @@ T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag)
# pragma warning(disable:4127) // conditional expression is constant
#endif
if(n == 0)
{
result = exp(-z) / z;
}
else if((n == 1) && (Tag::value))
{
result = expint_1_rational(z, tag);
}
else if(f)
{
result = expint_as_series(n, z, pol);
}
else
{
result = expint_as_fraction(n, z, pol);
}
#ifdef _MSC_VER
# pragma warning(pop)
#endif
@@ -488,8 +499,8 @@ template <class T>
struct expint_i_series
{
typedef T result_type;
expint_i_series(T z_) : k(0), z_k(1), z(z_){}
T operator()()
BOOST_MATH_GPU_ENABLED expint_i_series(T z_) : k(0), z_k(1), z(z_){}
BOOST_MATH_GPU_ENABLED T operator()()
{
z_k *= z / ++k;
return z_k / k;
@@ -501,22 +512,22 @@ private:
};
template <class T, class Policy>
T expint_i_as_series(T z, const Policy& pol)
BOOST_MATH_GPU_ENABLED T expint_i_as_series(T z, const Policy& pol)
{
BOOST_MATH_STD_USING
T result = log(z); // (log(z) - log(1 / z)) / 2;
result += constants::euler<T>();
expint_i_series<T> s(z);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
result = tools::sum_series(s, policies::get_epsilon<T, Policy>(), max_iter, result);
policies::check_series_iterations<T>("boost::math::expint_i_series<%1%>(%1%)", max_iter, pol);
return result;
}
template <class T, class Policy, class Tag>
T expint_i_imp(T z, const Policy& pol, const Tag& tag)
BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const Tag& tag)
{
static const char* function = "boost::math::expint<%1%>(%1%)";
constexpr auto function = "boost::math::expint<%1%>(%1%)";
if(z < 0)
return -expint_imp(1, T(-z), pol, tag);
if(z == 0)
@@ -525,10 +536,10 @@ T expint_i_imp(T z, const Policy& pol, const Tag& tag)
}
template <class T, class Policy>
T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& tag)
BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant<int, 53>& tag)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::expint<%1%>(%1%)";
constexpr auto function = "boost::math::expint<%1%>(%1%)";
if(z < 0)
return -expint_imp(1, T(-z), pol, tag);
if(z == 0)
@@ -541,7 +552,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
// Maximum Deviation Found: 2.852e-18
// Expected Error Term: 2.852e-18
// Max Error found at double precision = Poly: 2.636335e-16 Cheb: 4.187027e-16
static const T P[10] = {
BOOST_MATH_STATIC const T P[10] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 2.98677224343598593013),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.356343618769377415068),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.780836076283730801839),
@@ -553,7 +564,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, 0.798296365679269702435e-5),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.2777056254402008721e-6)
};
static const T Q[8] = {
BOOST_MATH_STATIC const T Q[8] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 53, -1.17090412365413911947),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.62215109846016746276),
@@ -564,11 +575,11 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, -0.138972589601781706598e-4)
};
static const T c1 = BOOST_MATH_BIG_CONSTANT(T, 53, 1677624236387711.0);
static const T c2 = BOOST_MATH_BIG_CONSTANT(T, 53, 4503599627370496.0);
static const T r1 = static_cast<T>(c1 / c2);
static const T r2 = BOOST_MATH_BIG_CONSTANT(T, 53, 0.131401834143860282009280387409357165515556574352422001206362e-16);
static const T r = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 0.372507410781366634461991866580119133535689497771654051555657435242200120636201854384926049951548942392));
BOOST_MATH_STATIC_LOCAL_VARIABLE const T c1 = BOOST_MATH_BIG_CONSTANT(T, 53, 1677624236387711.0);
BOOST_MATH_STATIC_LOCAL_VARIABLE const T c2 = BOOST_MATH_BIG_CONSTANT(T, 53, 4503599627370496.0);
BOOST_MATH_STATIC_LOCAL_VARIABLE const T r1 = static_cast<T>(c1 / c2);
BOOST_MATH_STATIC_LOCAL_VARIABLE const T r2 = BOOST_MATH_BIG_CONSTANT(T, 53, 0.131401834143860282009280387409357165515556574352422001206362e-16);
BOOST_MATH_STATIC_LOCAL_VARIABLE const T r = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 0.372507410781366634461991866580119133535689497771654051555657435242200120636201854384926049951548942392));
T t = (z / 3) - 1;
result = tools::evaluate_polynomial(P, t)
/ tools::evaluate_polynomial(Q, t);
@@ -588,8 +599,8 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
// Maximum Deviation Found: 6.546e-17
// Expected Error Term: 6.546e-17
// Max Error found at double precision = Poly: 6.890169e-17 Cheb: 6.772128e-17
static const T Y = 1.158985137939453125F;
static const T P[8] = {
BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.158985137939453125F;
BOOST_MATH_STATIC const T P[8] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 0.00139324086199402804173),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0349921221823888744966),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0264095520754134848538),
@@ -599,7 +610,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, -0.554086272024881826253e-4),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.396487648924804510056e-5)
};
static const T Q[8] = {
BOOST_MATH_STATIC const T Q[8] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.744625566823272107711),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.329061095011767059236),
@@ -621,8 +632,8 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
// Expected Error Term: -1.842e-17
// Max Error found at double precision = Poly: 4.375868e-17 Cheb: 5.860967e-17
static const T Y = 1.0869731903076171875F;
static const T P[9] = {
BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.0869731903076171875F;
BOOST_MATH_STATIC const T P[9] = {
BOOST_MATH_BIG_CONSTANT(T, 53, -0.00893891094356945667451),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0484607730127134045806),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0652810444222236895772),
@@ -633,7 +644,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, -0.000209750022660200888349),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.138652200349182596186e-4)
};
static const T Q[9] = {
BOOST_MATH_STATIC const T Q[9] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 53, 1.97017214039061194971),
BOOST_MATH_BIG_CONSTANT(T, 53, 1.86232465043073157508),
@@ -657,8 +668,8 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
// Max Error found at double precision = Poly: 1.441088e-16 Cheb: 1.864792e-16
static const T Y = 1.03937530517578125F;
static const T P[9] = {
BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.03937530517578125F;
BOOST_MATH_STATIC const T P[9] = {
BOOST_MATH_BIG_CONSTANT(T, 53, -0.00356165148914447597995),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0229930320357982333406),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0449814350482277917716),
@@ -669,7 +680,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, -0.000192178045857733706044),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.113161784705911400295e-9)
};
static const T Q[9] = {
BOOST_MATH_STATIC const T Q[9] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 53, 2.84354408840148561131),
BOOST_MATH_BIG_CONSTANT(T, 53, 3.6599610090072393012),
@@ -688,9 +699,9 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
else
{
// Max Error found at double precision = 3.381886e-17
static const T exp40 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 2.35385266837019985407899910749034804508871617254555467236651e17));
static const T Y= 1.013065338134765625F;
static const T P[6] = {
BOOST_MATH_STATIC_LOCAL_VARIABLE const T exp40 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 2.35385266837019985407899910749034804508871617254555467236651e17));
BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y= 1.013065338134765625F;
BOOST_MATH_STATIC const T P[6] = {
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0130653381347656243849),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.19029710559486576682),
BOOST_MATH_BIG_CONSTANT(T, 53, 94.7365094537197236011),
@@ -698,7 +709,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, 18932.0850014925993025),
BOOST_MATH_BIG_CONSTANT(T, 53, -38703.1431362056714134)
};
static const T Q[7] = {
BOOST_MATH_STATIC const T Q[7] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 53, 61.9733592849439884145),
BOOST_MATH_BIG_CONSTANT(T, 53, -2354.56211323420194283),
@@ -739,10 +750,10 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
}
template <class T, class Policy>
T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 64>& tag)
BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant<int, 64>& tag)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::expint<%1%>(%1%)";
constexpr auto function = "boost::math::expint<%1%>(%1%)";
if(z < 0)
return -expint_imp(1, T(-z), pol, tag);
if(z == 0)
@@ -976,7 +987,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 64>& ta
}
template <class T, class Policy>
void expint_i_imp_113a(T& result, const T& z, const Policy& pol)
BOOST_MATH_GPU_ENABLED void expint_i_imp_113a(T& result, const T& z, const Policy& pol)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 1.230e-36
@@ -1044,7 +1055,7 @@ void expint_i_imp_113a(T& result, const T& z, const Policy& pol)
}
template <class T>
void expint_i_113b(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113b(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 7.779e-36
@@ -1094,7 +1105,7 @@ void expint_i_113b(T& result, const T& z)
}
template <class T>
void expint_i_113c(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113c(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 1.082e-34
@@ -1147,7 +1158,7 @@ void expint_i_113c(T& result, const T& z)
}
template <class T>
void expint_i_113d(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113d(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 3.163e-35
@@ -1198,7 +1209,7 @@ void expint_i_113d(T& result, const T& z)
}
template <class T>
void expint_i_113e(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113e(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 7.972e-36
@@ -1252,7 +1263,7 @@ void expint_i_113e(T& result, const T& z)
}
template <class T>
void expint_i_113f(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113f(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 4.469e-36
@@ -1299,7 +1310,7 @@ void expint_i_113f(T& result, const T& z)
}
template <class T>
void expint_i_113g(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113g(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 5.588e-35
@@ -1344,7 +1355,7 @@ void expint_i_113g(T& result, const T& z)
}
template <class T>
void expint_i_113h(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113h(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 4.448e-36
@@ -1383,10 +1394,10 @@ void expint_i_113h(T& result, const T& z)
}
template <class T, class Policy>
T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 113>& tag)
BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant<int, 113>& tag)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::expint<%1%>(%1%)";
constexpr auto function = "boost::math::expint<%1%>(%1%)";
if(z < 0)
return -expint_imp(1, T(-z), pol, tag);
if(z == 0)
@@ -1491,12 +1502,12 @@ struct expint_i_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init(tag());
}
static void do_init(const std::integral_constant<int, 0>&){}
static void do_init(const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 0>&){}
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 53>&)
{
boost::math::expint(T(5), Policy());
boost::math::expint(T(7), Policy());
@@ -1504,7 +1515,7 @@ struct expint_i_initializer
boost::math::expint(T(38), Policy());
boost::math::expint(T(45), Policy());
}
static void do_init(const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
{
boost::math::expint(T(5), Policy());
boost::math::expint(T(7), Policy());
@@ -1512,7 +1523,7 @@ struct expint_i_initializer
boost::math::expint(T(38), Policy());
boost::math::expint(T(45), Policy());
}
static void do_init(const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
{
boost::math::expint(T(5), Policy());
boost::math::expint(T(7), Policy());
@@ -1524,12 +1535,14 @@ struct expint_i_initializer
boost::math::expint(T(200), Policy());
boost::math::expint(T(220), Policy());
}
void force_instantiate()const{}
BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
@@ -1541,33 +1554,35 @@ struct expint_1_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init(tag());
}
static void do_init(const std::integral_constant<int, 0>&){}
static void do_init(const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 0>&){}
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 53>&)
{
boost::math::expint(1, T(0.5), Policy());
boost::math::expint(1, T(2), Policy());
}
static void do_init(const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
{
boost::math::expint(1, T(0.5), Policy());
boost::math::expint(1, T(2), Policy());
}
static void do_init(const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
{
boost::math::expint(1, T(0.5), Policy());
boost::math::expint(1, T(2), Policy());
boost::math::expint(1, T(6), Policy());
}
void force_instantiate()const{}
BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
@@ -1575,8 +1590,8 @@ template <class T, class Policy, class tag>
const typename expint_1_initializer<T, Policy, tag>::init expint_1_initializer<T, Policy, tag>::initializer;
template <class T, class Policy>
inline typename tools::promote_args<T>::type
expint_forwarder(T z, const Policy& /*pol*/, std::true_type const&)
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
expint_forwarder(T z, const Policy& /*pol*/, boost::math::true_type const&)
{
typedef typename tools::promote_args<T>::type result_type;
typedef typename policies::evaluation<result_type, Policy>::type value_type;
@@ -1587,7 +1602,7 @@ inline typename tools::promote_args<T>::type
policies::promote_double<false>,
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
typedef std::integral_constant<int,
typedef boost::math::integral_constant<int,
precision_type::value <= 0 ? 0 :
precision_type::value <= 53 ? 53 :
precision_type::value <= 64 ? 64 :
@@ -1603,8 +1618,8 @@ inline typename tools::promote_args<T>::type
}
template <class T>
inline typename tools::promote_args<T>::type
expint_forwarder(unsigned n, T z, const std::false_type&)
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
expint_forwarder(unsigned n, T z, const boost::math::false_type&)
{
return boost::math::expint(n, z, policies::policy<>());
}
@@ -1612,7 +1627,7 @@ expint_forwarder(unsigned n, T z, const std::false_type&)
} // namespace detail
template <class T, class Policy>
inline typename tools::promote_args<T>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
expint(unsigned n, T z, const Policy& /*pol*/)
{
typedef typename tools::promote_args<T>::type result_type;
@@ -1624,7 +1639,7 @@ inline typename tools::promote_args<T>::type
policies::promote_double<false>,
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
typedef std::integral_constant<int,
typedef boost::math::integral_constant<int,
precision_type::value <= 0 ? 0 :
precision_type::value <= 53 ? 53 :
precision_type::value <= 64 ? 64 :
@@ -1641,7 +1656,7 @@ inline typename tools::promote_args<T>::type
}
template <class T, class U>
inline typename detail::expint_result<T, U>::type
BOOST_MATH_GPU_ENABLED inline typename detail::expint_result<T, U>::type
expint(T const z, U const u)
{
typedef typename policies::is_policy<U>::type tag_type;
@@ -1649,7 +1664,7 @@ inline typename detail::expint_result<T, U>::type
}
template <class T>
inline typename tools::promote_args<T>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
expint(T z)
{
return expint(z, policies::policy<>());
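The forwarders above all funnel into the tag-dispatched implementations; a caller-side sketch of the policy overload (the policy name and choice are illustrative, not from the diff):

    #include <boost/math/special_functions/expint.hpp>
    #include <boost/math/policies/policy.hpp>

    // Keep evaluation in double rather than promoting, mirroring the
    // promote_double<false> used in the forwarding policies above.
    typedef boost::math::policies::policy<
        boost::math::policies::promote_double<false> > my_policy;

    double e1(double z)
    {
        return boost::math::expint(1U, z, my_policy());
    }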

View File

@@ -1,4 +1,5 @@
// (C) Copyright Nick Thompson 2019.
// (C) Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -6,21 +7,25 @@
#ifndef BOOST_MATH_SPECIAL_GEGENBAUER_HPP
#define BOOST_MATH_SPECIAL_GEGENBAUER_HPP
#include <limits>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#ifndef BOOST_MATH_NO_EXCEPTIONS
#include <stdexcept>
#include <type_traits>
#endif
namespace boost { namespace math {
template<typename Real>
Real gegenbauer(unsigned n, Real lambda, Real x)
BOOST_MATH_GPU_ENABLED Real gegenbauer(unsigned n, Real lambda, Real x)
{
static_assert(!std::is_integral<Real>::value, "Gegenbauer polynomials require floating point arguments.");
static_assert(!boost::math::is_integral<Real>::value, "Gegenbauer polynomials require floating point arguments.");
if (lambda <= -1/Real(2)) {
#ifndef BOOST_MATH_NO_EXCEPTIONS
throw std::domain_error("lambda > -1/2 is required.");
#else
return std::numeric_limits<Real>::quiet_NaN();
return boost::math::numeric_limits<Real>::quiet_NaN();
#endif
}
// The only reason to do this is because of some instability that could be present for x < 0 that is not present for x > 0.
@@ -41,7 +46,7 @@ Real gegenbauer(unsigned n, Real lambda, Real x)
Real yk = y1;
Real k = 2;
Real k_max = n*(1+std::numeric_limits<Real>::epsilon());
Real k_max = n*(1+boost::math::numeric_limits<Real>::epsilon());
Real gamma = 2*(lambda - 1);
while(k < k_max)
{
@@ -55,7 +60,7 @@ Real gegenbauer(unsigned n, Real lambda, Real x)
template<typename Real>
Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k)
BOOST_MATH_GPU_ENABLED Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k)
{
if (k > n) {
return Real(0);
@@ -70,7 +75,7 @@ Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k)
}
template<typename Real>
Real gegenbauer_prime(unsigned n, Real lambda, Real x) {
BOOST_MATH_GPU_ENABLED Real gegenbauer_prime(unsigned n, Real lambda, Real x) {
return gegenbauer_derivative<Real>(n, lambda, x, 1);
}
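For reference, gegenbauer_derivative leans on the standard identity d/dx C_n^lambda(x) = 2 lambda C_{n-1}^{lambda+1}(x); a small host-side check of gegenbauer_prime against it (arguments are arbitrary):

    #include <boost/math/special_functions/gegenbauer.hpp>
    #include <cmath>
    #include <iostream>

    int main()
    {
        double lambda = 0.5, x = 0.25;
        double lhs = boost::math::gegenbauer_prime(4, lambda, x);
        double rhs = 2 * lambda * boost::math::gegenbauer(3, lambda + 1, x);
        std::cout << std::abs(lhs - rhs) << '\n'; // ~0 up to rounding
    }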

View File

@@ -1,4 +1,5 @@
// Copyright John Maddock 2012.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt
@@ -7,26 +8,31 @@
#ifndef BOOST_MATH_HANKEL_HPP
#define BOOST_MATH_HANKEL_HPP
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/detail/iconv.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/policies/error_handling.hpp>
namespace boost{ namespace math{
namespace detail{
template <class T, class Policy>
std::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign)
BOOST_MATH_GPU_ENABLED boost::math::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)";
if(x < 0)
{
bool isint_v = floor(v) == v;
T j, y;
bessel_jy(v, -x, &j, &y, need_j | need_y, pol);
std::complex<T> cx(x), cv(v);
std::complex<T> j_result, y_result;
boost::math::complex<T> cx(x), cv(v);
boost::math::complex<T> j_result, y_result;
if(isint_v)
{
int s = (iround(v) & 1) ? -1 : 1;
@@ -37,12 +43,12 @@ std::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol
{
j_result = pow(cx, v) * pow(-cx, -v) * j;
T p1 = pow(-x, v);
std::complex<T> p2 = pow(cx, v);
boost::math::complex<T> p2 = pow(cx, v);
y_result = p1 * y / p2
+ (p2 / p1 - p1 / p2) * j / tan(constants::pi<T>() * v);
}
// multiply y_result by i:
y_result = std::complex<T>(-sign * y_result.imag(), sign * y_result.real());
y_result = boost::math::complex<T>(-sign * y_result.imag(), sign * y_result.real());
return j_result + y_result;
}
@@ -51,25 +57,25 @@ std::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol
if(v == 0)
{
// J is 1, Y is -INF
return std::complex<T>(1, sign * -policies::raise_overflow_error<T>(function, nullptr, pol));
return boost::math::complex<T>(1, sign * -policies::raise_overflow_error<T>(function, nullptr, pol));
}
else
{
// At least one of J and Y is complex infinity:
return std::complex<T>(policies::raise_overflow_error<T>(function, nullptr, pol), sign * policies::raise_overflow_error<T>(function, nullptr, pol));
return boost::math::complex<T>(policies::raise_overflow_error<T>(function, nullptr, pol), sign * policies::raise_overflow_error<T>(function, nullptr, pol));
}
}
T j, y;
bessel_jy(v, x, &j, &y, need_j | need_y, pol);
return std::complex<T>(j, sign * y);
return boost::math::complex<T>(j, sign * y);
}
template <class T, class Policy>
std::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign);
BOOST_MATH_GPU_ENABLED boost::math::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign);
template <class T, class Policy>
inline std::complex<T> hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<T> hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign)
{
BOOST_MATH_STD_USING // ADL of std names.
int ival = detail::iconv(v, pol);
@@ -81,57 +87,57 @@ inline std::complex<T> hankel_imp(T v, T x, const bessel_maybe_int_tag&, const P
}
template <class T, class Policy>
inline std::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign)
{
BOOST_MATH_STD_USING
if((std::abs(v) < 200) && (x > 0))
return std::complex<T>(bessel_jn(v, x, pol), sign * bessel_yn(v, x, pol));
if((abs(v) < 200) && (x > 0))
return boost::math::complex<T>(bessel_jn(v, x, pol), sign * bessel_yn(v, x, pol));
return hankel_imp(static_cast<T>(v), x, bessel_no_int_tag(), pol, sign);
}
template <class T, class Policy>
inline std::complex<T> sph_hankel_imp(T v, T x, const Policy& pol, int sign)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<T> sph_hankel_imp(T v, T x, const Policy& pol, int sign)
{
BOOST_MATH_STD_USING
return constants::root_half_pi<T>() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(std::complex<T>(x));
return constants::root_half_pi<T>() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(boost::math::complex<T>(x));
}
} // namespace detail
template <class T1, class T2, class Policy>
inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
typedef typename detail::bessel_traits<T1, T2, Policy>::optimisation_tag tag_type;
typedef typename policies::evaluation<result_type, Policy>::type value_type;
return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::hankel_imp<value_type>(v, static_cast<value_type>(x), tag_type(), pol, 1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::hankel_imp<value_type>(v, static_cast<value_type>(x), tag_type(), pol, 1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
}
template <class T1, class T2>
inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x)
{
return cyl_hankel_1(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
typedef typename detail::bessel_traits<T1, T2, Policy>::optimisation_tag tag_type;
typedef typename policies::evaluation<result_type, Policy>::type value_type;
return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::hankel_imp<value_type>(v, static_cast<value_type>(x), tag_type(), pol, -1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::hankel_imp<value_type>(v, static_cast<value_type>(x), tag_type(), pol, -1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
}
template <class T1, class T2>
inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x)
{
return cyl_hankel_2(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy&)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy&)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -143,17 +149,17 @@ inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type>
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::sph_hankel_imp<value_type>(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), 1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::sph_hankel_imp<value_type>(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), 1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
}
template <class T1, class T2>
inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x)
{
return sph_hankel_1(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy&)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy&)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -165,11 +171,11 @@ inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type>
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::sph_hankel_imp<value_type>(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), -1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::sph_hankel_imp<value_type>(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), -1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
}
template <class T1, class T2>
inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x)
{
return sph_hankel_2(v, x, policies::policy<>());
}

View File

@@ -1,5 +1,6 @@
// (C) Copyright John Maddock 2006.
// (C) Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -11,8 +12,9 @@
#pragma once
#endif
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/promotion.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/policies/error_handling.hpp>
namespace boost{
@@ -20,7 +22,7 @@ namespace math{
// Recurrence relation for Hermite polynomials:
template <class T1, class T2, class T3>
inline typename tools::promote_args<T1, T2, T3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1)
{
using promoted_type = tools::promote_args_t<T1, T2, T3>;
@@ -31,7 +33,7 @@ namespace detail{
// Implement Hermite polynomials via recurrence:
template <class T>
T hermite_imp(unsigned n, T x)
BOOST_MATH_GPU_ENABLED T hermite_imp(unsigned n, T x)
{
T p0 = 1;
T p1 = 2 * x;
@@ -43,7 +45,7 @@ T hermite_imp(unsigned n, T x)
while(c < n)
{
std::swap(p0, p1);
BOOST_MATH_GPU_SAFE_SWAP(p0, p1);
p1 = static_cast<T>(hermite_next(c, x, p0, p1));
++c;
}
@@ -53,7 +55,7 @@ T hermite_imp(unsigned n, T x)
} // namespace detail
template <class T, class Policy>
inline typename tools::promote_args<T>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
hermite(unsigned n, T x, const Policy&)
{
typedef typename tools::promote_args<T>::type result_type;
@@ -62,7 +64,7 @@ inline typename tools::promote_args<T>::type
}
template <class T>
inline typename tools::promote_args<T>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
hermite(unsigned n, T x)
{
return boost::math::hermite(n, x, policies::policy<>());

View File

@@ -27,6 +27,7 @@
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/promotion.hpp> // for argument promotion.
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/complex.hpp>
#include <boost/math/policies/policy.hpp>
#ifdef BOOST_MATH_HAS_NVRTC
@@ -50,7 +51,53 @@ namespace detail{
>::type;
};
} // namespace detail
template <class T, class U>
struct expint_result
{
using type = typename boost::math::conditional<
policies::is_policy<U>::value,
tools::promote_args_t<T>,
typename tools::promote_args<U>::type
>::type;
};
typedef boost::math::integral_constant<int, 0> bessel_no_int_tag; // No integer optimisation possible.
typedef boost::math::integral_constant<int, 1> bessel_maybe_int_tag; // Maybe integer optimisation.
typedef boost::math::integral_constant<int, 2> bessel_int_tag; // Definite integer optimisation.
template <class T1, class T2, class Policy>
struct bessel_traits
{
using result_type = typename boost::math::conditional<
boost::math::is_integral<T1>::value,
typename tools::promote_args<T2>::type,
tools::promote_args_t<T1, T2>
>::type;
typedef typename policies::precision<result_type, Policy>::type precision_type;
using optimisation_tag = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 64),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
using optimisation_tag128 = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 113),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
};
} // namespace detail
} // namespace math
} // namespace boost
@@ -284,15 +331,15 @@ namespace boost
laguerre(unsigned n, T1 m, T2 x);
template <class T>
tools::promote_args_t<T>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T>
hermite(unsigned n, T x);
template <class T, class Policy>
tools::promote_args_t<T>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T>
hermite(unsigned n, T x, const Policy& pol);
template <class T1, class T2, class T3>
tools::promote_args_t<T1, T2, T3>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2, T3>
hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
template<class T1, class T2, class T3>
@@ -808,28 +855,28 @@ namespace boost
const Policy&);
template <class T1, class T2>
std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x);
template <class T1, class T2, class Policy>
std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol);
template <class T1, class T2, class Policy>
std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x);
template <class T1, class T2, class Policy>
std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x);
template <class T1, class T2, class Policy>
std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x);
template <class T, class Policy>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_ai(T x, const Policy&);
@@ -936,7 +983,7 @@ namespace boost
template <class T, class U>
struct expint_result
{
typedef typename std::conditional<
typedef typename boost::math::conditional<
policies::is_policy<U>::value,
tools::promote_args_t<T>,
typename tools::promote_args<U>::type
@@ -946,13 +993,13 @@ namespace boost
} // namespace detail
template <class T, class Policy>
tools::promote_args_t<T> expint(unsigned n, T z, const Policy&);
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> expint(unsigned n, T z, const Policy&);
template <class T, class U>
typename detail::expint_result<T, U>::type expint(T const z, U const u);
BOOST_MATH_GPU_ENABLED typename detail::expint_result<T, U>::type expint(T const z, U const u);
template <class T>
tools::promote_args_t<T> expint(T z);
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> expint(T z);
// Zeta:
template <class T, class Policy>
@@ -1346,7 +1393,7 @@ namespace boost
laguerre(unsigned n, T1 m, T2 x) { return ::boost::math::laguerre(n, m, x, Policy()); }\
\
template <class T>\
inline boost::math::tools::promote_args_t<T> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> \
hermite(unsigned n, T x){ return ::boost::math::hermite(n, x, Policy()); }\
\
using boost::math::hermite_next;\
@@ -1620,11 +1667,11 @@ template <class OutputIterator, class T>\
using boost::math::changesign;\
\
template <class T, class U>\
inline typename boost::math::tools::promote_args_t<T,U> expint(T const& z, U const& u)\
BOOST_MATH_GPU_ENABLED inline typename boost::math::tools::promote_args_t<T,U> expint(T const& z, U const& u)\
{ return boost::math::expint(z, u, Policy()); }\
\
template <class T>\
inline boost::math::tools::promote_args_t<T> expint(T z){ return boost::math::expint(z, Policy()); }\
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> expint(T z){ return boost::math::expint(z, Policy()); }\
\
template <class T>\
inline boost::math::tools::promote_args_t<T> zeta(T s){ return boost::math::zeta(s, Policy()); }\
@@ -1669,19 +1716,19 @@ template <class OutputIterator, class T>\
inline boost::math::tools::promote_args_t<RT1, RT2> owens_t(RT1 a, RT2 z){ return boost::math::owens_t(a, z, Policy()); }\
\
template <class T1, class T2>\
inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> cyl_hankel_1(T1 v, T2 x)\
inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> cyl_hankel_1(T1 v, T2 x)\
{ return boost::math::cyl_hankel_1(v, x, Policy()); }\
\
template <class T1, class T2>\
inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> cyl_hankel_2(T1 v, T2 x)\
inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> cyl_hankel_2(T1 v, T2 x)\
{ return boost::math::cyl_hankel_2(v, x, Policy()); }\
\
template <class T1, class T2>\
inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> sph_hankel_1(T1 v, T2 x)\
inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> sph_hankel_1(T1 v, T2 x)\
{ return boost::math::sph_hankel_1(v, x, Policy()); }\
\
template <class T1, class T2>\
inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> sph_hankel_2(T1 v, T2 x)\
inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> sph_hankel_2(T1 v, T2 x)\
{ return boost::math::sph_hankel_2(v, x, Policy()); }\
\
template <class T>\

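A note on the expint_result dispatch shown above: the second argument of the two-argument expint overload may be either a numeric order or a policy, and the metafunction selects the return type accordingly. A minimal host-side sketch of the three call forms (illustrative only, not part of this commit):
#include <boost/math/special_functions/expint.hpp>
#include <boost/math/policies/policy.hpp>
int main()
{
    using boost::math::expint;
    double ei = expint(1.5);                                    // one argument: Ei(x)
    double en = expint(2, 1.5);                                 // order then argument: E_n(x)
    double ep = expint(1.5, boost::math::policies::policy<>()); // argument then policy: Ei(x) under a policy
    return (ei > 0.0 && en > 0.0 && ep > 0.0) ? 0 : 1;
}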
View File

@@ -14,11 +14,93 @@
#include <boost/math/tools/is_detected.hpp>
#ifdef BOOST_MATH_ENABLE_CUDA
#include <cuda/std/utility>
#endif
#ifndef BOOST_MATH_HAS_NVRTC
#include <cuda/std/utility>
#include <cuda/std/complex>
namespace boost {
namespace math {
template <typename T>
using complex = cuda::std::complex<T>;
using cuda::std::real;
using cuda::std::imag;
using cuda::std::abs;
using cuda::std::arg;
using cuda::std::norm;
using cuda::std::conj;
using cuda::std::proj;
using cuda::std::polar;
using cuda::std::exp;
using cuda::std::log;
using cuda::std::log10;
using cuda::std::pow;
using cuda::std::sqrt;
using cuda::std::sin;
using cuda::std::cos;
using cuda::std::tan;
using cuda::std::asin;
using cuda::std::acos;
using cuda::std::atan;
using cuda::std::sinh;
using cuda::std::cosh;
using cuda::std::tanh;
using cuda::std::asinh;
using cuda::std::acosh;
using cuda::std::atanh;
} // namespace math
} // namespace boost
#else
#include <utility>
#include <complex>
namespace boost {
namespace math {
template <typename T>
using complex = std::complex<T>;
using std::real;
using std::imag;
using std::abs;
using std::arg;
using std::norm;
using std::conj;
using std::proj;
using std::polar;
using std::exp;
using std::log;
using std::log10;
using std::pow;
using std::sqrt;
using std::sin;
using std::cos;
using std::tan;
using std::asin;
using std::acos;
using std::atan;
using std::sinh;
using std::cosh;
using std::tanh;
using std::asinh;
using std::acosh;
using std::atanh;
} // namespace math
} // namespace boost
#endif
namespace boost {

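The alias block above is what lets one generic definition serve both host and device: boost::math::complex<T> resolves to cuda::std::complex<T> when CUDA is enabled and to std::complex<T> otherwise, with the accompanying using-declarations covering the free functions. A hypothetical sketch (the unit_phasor helper is not library code), assuming BOOST_MATH_GPU_ENABLED comes from <boost/math/tools/config.hpp>:
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/complex.hpp>
template <typename T>
BOOST_MATH_GPU_ENABLED boost::math::complex<T> unit_phasor(T theta)
{
    // polar resolves to std::polar on host-only builds and to
    // cuda::std::polar under CUDA, per the aliases above.
    return boost::math::polar(T(1), theta);
}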
View File

@@ -678,6 +678,7 @@ namespace boost{ namespace math{
#include <cuda/std/cstdint>
#include <cuda/std/array>
#include <cuda/std/tuple>
#include <cuda/std/complex>
# define BOOST_MATH_CUDA_ENABLED __host__ __device__
# define BOOST_MATH_HAS_GPU_SUPPORT

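For context, a minimal sketch of what the marker macro buys (the clamp01 helper below is hypothetical, not library code): under NVCC the macro expands to __host__ __device__, so a single definition is callable from both sides, while a host-only build sees a plain function.
#include <boost/math/tools/config.hpp>
template <typename T>
BOOST_MATH_GPU_ENABLED T clamp01(T x)
{
    // __host__ __device__ under CUDA; a plain function template otherwise.
    return x < T(0) ? T(0) : (x > T(1) ? T(1) : x);
}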
View File

@@ -304,6 +304,14 @@ run test_cyl_neumann_double.cu ;
run test_cyl_neumann_float.cu ;
run test_sph_neumann_double.cu ;
run test_sph_neumann_float.cu ;
run test_cyl_hankel_1_double.cu ;
run test_cyl_hankel_1_float.cu ;
run test_cyl_hankel_2_double.cu ;
run test_cyl_hankel_2_float.cu ;
run test_sph_hankel_1_double.cu ;
run test_sph_hankel_1_float.cu ;
run test_sph_hankel_2_double.cu ;
run test_sph_hankel_2_float.cu ;
run test_cbrt_double.cu ;
run test_cbrt_float.cu ;
@@ -339,9 +347,18 @@ run test_erfc_float.cu ;
run test_erfc_inv_double.cu ;
run test_erfc_inv_float.cu ;
run test_expint_double.cu ;
run test_expint_float.cu ;
run test_expm1_double.cu ;
run test_expm1_float.cu ;
run test_gegenbauer_double.cu ;
run test_gegenbauer_float.cu ;
run test_hermite_double.cu ;
run test_hermite_float.cu ;
run test_lgamma_double.cu ;
run test_lgamma_float.cu ;
run test_tgamma_double.cu ;

View File

@@ -9,9 +9,6 @@ project : requirements
[ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
;
run test_heumann_lambda_nvrtc_double.cpp ;
run test_heumann_lambda_nvrtc_float.cpp ;
# Quad
run test_exp_sinh_quad_nvrtc_float.cpp ;
run test_exp_sinh_quad_nvrtc_double.cpp ;
@@ -306,6 +303,14 @@ run test_cyl_neumann_nvrtc_double.cpp ;
run test_cyl_neumann_nvrtc_float.cpp ;
run test_sph_neumann_nvrtc_double.cpp ;
run test_sph_neumann_nvrtc_float.cpp ;
run test_cyl_hankel_1_nvrtc_double.cpp ;
run test_cyl_hankel_1_nvrtc_float.cpp ;
run test_cyl_hankel_2_nvrtc_double.cpp ;
run test_cyl_hankel_2_nvrtc_float.cpp ;
run test_sph_hankel_1_nvrtc_double.cpp ;
run test_sph_hankel_1_nvrtc_float.cpp ;
run test_sph_hankel_2_nvrtc_double.cpp ;
run test_sph_hankel_2_nvrtc_float.cpp ;
run test_cbrt_nvrtc_double.cpp ;
run test_cbrt_nvrtc_float.cpp ;
@@ -335,6 +340,11 @@ run test_ellint_d_nvrtc_double.cpp ;
run test_ellint_d_nvrtc_float.cpp ;
run test_jacobi_zeta_nvrtc_double.cpp ;
run test_jacobi_zeta_nvrtc_float.cpp ;
run test_heumann_lambda_nvrtc_double.cpp ;
run test_heumann_lambda_nvrtc_float.cpp ;
run test_expint_nvrtc_double.cpp ;
run test_expint_nvrtc_float.cpp ;
run test_expm1_nvrtc_double.cpp ;
run test_expm1_nvrtc_float.cpp ;
@@ -351,6 +361,12 @@ run test_gamma_p_inv_nvrtc_float.cpp ;
run test_tgamma_ratio_nvrtc_double.cpp ;
run test_tgamma_ratio_nvrtc_float.cpp ;
run test_gegenbauer_nvrtc_double.cpp ;
run test_gegenbauer_nvrtc_float.cpp ;
run test_hermite_nvrtc_double.cpp ;
run test_hermite_nvrtc_float.cpp ;
run test_log1p_nvrtc_double.cpp ;
run test_log1p_nvrtc_float.cpp ;

View File

@@ -71,8 +71,14 @@ run test_sign.cpp ;
run test_round.cpp ;
run test_expint.cpp ;
run test_expm1_simple.cpp ;
run gegenbauer_test.cpp ;
run test_hankel.cpp ;
run test_log1p_simple.cpp ;
run test_digamma_simple.cpp ;
@@ -85,3 +91,5 @@ run test_gamma.cpp ;
run test_igamma.cpp ;
run test_igamma_inv.cpp ;
run test_igamma_inva.cpp ;
run test_hermite.cpp ;

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results;
results.resize(numElements); // resize (not reserve): the loop below assigns by index
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::cyl_hankel_1(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results;
results.resize(numElements); // resize (not reserve): the loop below assigns by index
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::cyl_hankel_1(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_cyl_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_hankel_1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_hankel_1(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_cyl_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_hankel_1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_hankel_1(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results;
results.resize(numElements); // resize (not reserve): the loop below assigns by index
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::cyl_hankel_2(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results;
results.resize(numElements); // resize (not reserve): the loop below assigns by index
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::cyl_hankel_2(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_cyl_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_2_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_hankel_2_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_2_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_hankel_2(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_cyl_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_2_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_hankel_2_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_2_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_hankel_2(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -3,7 +3,14 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#endif
#ifndef BOOST_MATH_OVERFLOW_ERROR_POLICY
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#endif
#include "test_expint.hpp"
//
@@ -78,7 +85,11 @@ void expected_results()
".*", // platform
"float|double|long double", // test type(s)
".*Ei.*", // test data group
#ifndef SYCL_LANGUAGE_VERSION
".*", 6, 3); // test function
#else
".*", 10, 3);
#endif
if(std::numeric_limits<long double>::digits > 100)
{
add_expected_result(

View File

@@ -4,13 +4,19 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
#include <boost/math/concepts/real_concept.hpp>
#endif
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/expint.hpp>
#define BOOST_TEST_MAIN
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/tools/stats.hpp>
#include <boost/math/tools/test.hpp>
#include "../include_private/boost/math/tools/test.hpp"
#include <boost/math/constants/constants.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>

test/test_expint_double.cu (new file, 106 lines)
View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::expint(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = dist(rng);
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::expint(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

test/test_expint_float.cu (new file, 106 lines)
View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::expint(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = dist(rng);
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::expint(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/expint.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/expint.hpp>
extern "C" __global__
void test_expint_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::expint(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_expint_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_expint_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_expint_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::expint(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/expint.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/expint.hpp>
extern "C" __global__
void test_expint_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::expint(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_expint_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_expint_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
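// Note: the include paths above are environment specific; local runs will
// generally need to point --include-path at their own Boost checkout.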
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
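// The PTX is JIT-compiled for the actual device by the driver when the module is loaded below.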
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_expint_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
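// cuLaunchKernel takes an array of pointers to each kernel argument, not the argument values themselves.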
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify results against the host implementation and track failures
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::expint(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,125 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Gegenbauer prime uses all methods internally so it's the easy choice
#include <iostream>
#include <iomanip>
#include <vector>
#include <cmath>   // std::isfinite used when checking results
#include <boost/math/special_functions.hpp>
#include <boost/math/special_functions/gegenbauer.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::gegenbauer_prime(2, in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
// Size the vector up front: operator[] on a merely reserved vector is undefined behavior
std::vector<float_type> results(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::gegenbauer_prime(2, input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]);
// Most elements agree to within 50 eps, but values near zero can differ far more
if (eps > 1000)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i]
<< "\n Host: " << results[i]
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,124 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Gegenbauer prime uses all methods internally so it's the easy choice
#include <iostream>
#include <iomanip>
#include <vector>
#include <cmath>   // std::isfinite used when checking results
#include <boost/math/special_functions.hpp>
#include <boost/math/special_functions/gegenbauer.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::gegenbauer_prime(2, in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
// Size the vector up front: operator[] on a merely reserved vector is undefined behavior
std::vector<float_type> results(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::gegenbauer_prime(2, input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]);
if (eps > 1000)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i]
<< "\n Host: " << results[i]
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath>   // std::isfinite used in host-side verification
#include <boost/math/special_functions/gegenbauer.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/gegenbauer.hpp>
extern "C" __global__
void test_gegenbauer_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::gegenbauer(2, in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gegenbauer_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_gegenbauer_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_gegenbauer_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify results against the host implementation and track failures
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::gegenbauer(2, h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath>   // std::isfinite used in host-side verification
#include <boost/math/special_functions/gegenbauer.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/gegenbauer.hpp>
extern "C" __global__
void test_gegenbauer_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::gegenbauer(2, in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gegenbauer_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_gegenbauer_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_gegenbauer_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify results against the host implementation and track failures
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::gegenbauer(2, h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -3,9 +3,13 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#endif
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <boost/math/tools/config.hpp>
#define BOOST_TEST_MAIN
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
@@ -85,6 +89,7 @@ void test_hankel(T, const char* name)
//
// Instantiate a few instances to check our error handling code can cope with std::complex:
//
#ifndef SYCL_LANGUAGE_VERSION
typedef boost::math::policies::policy<
boost::math::policies::overflow_error<boost::math::policies::throw_on_error>,
boost::math::policies::denorm_error<boost::math::policies::throw_on_error>,
@@ -120,7 +125,7 @@ typedef boost::math::policies::policy<
boost::math::policies::indeterminate_result_error<boost::math::policies::ignore_error> > pol3;
template std::complex<double> boost::math::cyl_hankel_1<double, double, pol3>(double, double, const pol3&);
#endif
BOOST_AUTO_TEST_CASE( test_main )
{

View File

@@ -5,8 +5,11 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#include"test_hermite.hpp"
#endif
#include "test_hermite.hpp"
//
// DESCRIPTION:

View File

@@ -11,11 +11,17 @@
// Constants are too big for float case, but this doesn't matter for test.
#endif
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
#include <boost/math/concepts/real_concept.hpp>
#endif
#define BOOST_TEST_MAIN
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/hermite.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/array.hpp>
#include "functor.hpp"

test/test_hermite_double.cu
View File

@@ -0,0 +1,120 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <cmath>   // std::isfinite used when checking results
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
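// Note: in1 is deliberately unused; hermite(1, x) == 2x keeps both the device
// computation and the host reference below trivial to verify.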
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::hermite(1U, in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::hermite(1U, input_vector2[i]));
double t = w.elapsed();
// check the results
int fail_counter = 0;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i] << '\n'
<< " Host: " << results[i] << '\n'
<< " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
fail_counter++;
if (fail_counter > 100)
{
break;
}
}
}
}
if (fail_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

test/test_hermite_float.cu
View File

@@ -0,0 +1,120 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <cmath>   // std::isfinite used when checking results
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::hermite(1U, in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::hermite(1U, input_vector2[i]));
double t = w.elapsed();
// check the results
int fail_counter = 0;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i] << '\n'
<< " Host: " << results[i] << '\n'
<< " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
fail_counter++;
if (fail_counter > 100)
{
break;
}
}
}
}
if (fail_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath>   // std::isfinite used in host-side verification
#include <boost/math/special_functions/hermite.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hermite.hpp>
extern "C" __global__
void test_hermite_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::hermite(1U, in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_hermite_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_hermite_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_hermite_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify results against the host implementation and track failures
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::hermite(1U, h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath>   // std::isfinite used in host-side verification
#include <boost/math/special_functions/hermite.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hermite.hpp>
extern "C" __global__
void test_hermite_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::hermite(1U, in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_hermite_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_hermite_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_hermite_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify results against the host implementation and track failures
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::hermite(1U, h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
// Size the vector up front: operator[] on a merely reserved vector is undefined behavior
std::vector<boost::math::complex<float_type>> results(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::sph_hankel_1(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
// Size the vector up front: operator[] on a merely reserved vector is undefined behavior
std::vector<boost::math::complex<float_type>> results(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::sph_hankel_1(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_sph_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_sph_hankel_1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
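// Assumption: boost::math::complex<T> is std::complex<T> on the host and a
// layout-compatible complex type in device code, so the raw cudaMemcpy below is safe.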
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_hankel_1(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (fail_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_sph_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_sph_hankel_1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
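// Ceiling division: with numElements = 5000 this is (5000 + 255) / 256 = 20 blocks
// (5120 threads); the i < numElements guard in the kernel masks the 120 excess threads.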
int numBlocks = (numElements + blockSize - 1) / blockSize;
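// cuLaunchKernel takes kernel parameters as an array of pointers, one per argument.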
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
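// Only the real component is checked; 300 epsilon-differences is a loose bound
// that allows for device-vs-host rounding differences in float.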
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_hankel_1(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuModuleUnload(module);
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
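
The sph_hankel_1 and sph_hankel_2 NVRTC drivers in this commit are identical except for the function under test and the choice of float_type. A sketch of how the embedded kernel source could be stamped out from one template (an assumed refactoring, not something the commit does):

    #include <string>

    // Hypothetical generator for the embedded kernel source: one driver could
    // then cover sph_hankel_1 and sph_hankel_2 at both float and double.
    std::string make_kernel_source(const std::string& func, const std::string& fp_type)
    {
        return "typedef " + fp_type + " float_type;\n"
               "#include <cuda/std/type_traits>\n"
               "#include <boost/math/special_functions/hankel.hpp>\n"
               "extern \"C\" __global__\n"
               "void test_kernel(const float_type *in1, const float_type* in2, "
               "boost::math::complex<float_type> *out, int numElements)\n"
               "{\n"
               "    int i = blockDim.x * blockIdx.x + threadIdx.x;\n"
               "    if (i < numElements)\n"
               "    {\n"
               "        out[i] = boost::math::" + func + "(in1[i], in2[i]);\n"
               "    }\n"
               "}\n";
    }

The returned string would be passed to nvrtcCreateProgram in place of the cuda_kernel literal, with "test_kernel" then used for cuModuleGetFunction.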

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results(numElements); // sized, not merely reserved: operator[] past size() is undefined behavior
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::sph_hankel_2(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
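
cuda_managed_ptr comes from a test-support header ("cuda_managed_ptr.hpp") that this diff does not show. Its usage above, construction from an element count, host-side operator[], and .get() handed straight to a kernel, is consistent with a thin RAII wrapper over CUDA unified memory. A minimal sketch under that assumption (the real header may also construct elements; this sketch assumes raw storage suffices for the arithmetic and complex types used here):

    #include <cstddef>
    #include <cuda_runtime.h>

    // Hypothetical reconstruction of the cuda_managed_ptr helper: unified
    // memory is addressable from both host and device, which is why the tests
    // can fill the buffers on the host and read results back without memcpy.
    template <class T>
    class cuda_managed_ptr
    {
        T* ptr_ = nullptr;
    public:
        explicit cuda_managed_ptr(std::size_t n)
        {
            cudaMallocManaged(&ptr_, n * sizeof(T));
        }
        ~cuda_managed_ptr() { cudaFree(ptr_); }
        cuda_managed_ptr(const cuda_managed_ptr&) = delete;
        cuda_managed_ptr& operator=(const cuda_managed_ptr&) = delete;
        T& operator[](std::size_t i) { return ptr_[i]; }
        const T& operator[](std::size_t i) const { return ptr_[i]; }
        T* get() const { return ptr_; }
    };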

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results(numElements); // sized, not merely reserved: operator[] past size() is undefined behavior
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::sph_hankel_2(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
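
The watch timer is the other unshown helper (from "stopwatch.hpp"). The calls made here, elapsed() returning seconds as a double and reset(), suggest a small std::chrono wrapper along these lines (a sketch, not the actual header):

    #include <chrono>

    // Hypothetical reconstruction of the "watch" helper: starts timing on
    // construction and reports elapsed wall-clock time in seconds.
    class watch
    {
        std::chrono::steady_clock::time_point start_ = std::chrono::steady_clock::now();
    public:
        double elapsed() const
        {
            return std::chrono::duration<double>(std::chrono::steady_clock::now() - start_).count();
        }
        void reset() { start_ = std::chrono::steady_clock::now(); }
    };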

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_sph_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_2_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_sph_hankel_2_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_2_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_hankel_2(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuModuleUnload(module);
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
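
The tolerance of 300 used by the NVRTC drivers is measured in units of machine epsilon: boost::math::epsilon_difference(a, b) reports how many epsilons separate two values. A tiny host-only illustration, independent of the tests above:

    #include <iostream>
    #include <limits>
    #include <boost/math/special_functions/relative_difference.hpp>

    int main()
    {
        const float a = 1.0f;
        const float b = a + 4 * std::numeric_limits<float>::epsilon();
        // Prints approximately 4: the values sit four float epsilons apart,
        // well inside the 300-epsilon bound the kernels are held to.
        std::cout << boost::math::epsilon_difference(a, b) << std::endl;
    }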

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_sph_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_2_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_sph_hankel_2_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_2_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_hankel_2(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuModuleUnload(module);
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
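
One build note on the drivers above: unlike the runtime-API tests, which need only libcudart, the NVRTC drivers touch all three host-side CUDA libraries and must link against libnvrtc (runtime compilation), libcuda (the driver API used for module loading and cuLaunchKernel), and libcudart (cudaMalloc/cudaMemcpy).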