From 047c206c3028331d353e20a13855f8fafb0ccd5e Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 20 Aug 2024 16:40:47 -0400
Subject: [PATCH] Add bessel GPU support

Add GPU support to bessel_i0
Add CUDA and NVRTC testing
Add GPU support to bessel_i1
Add CUDA and NVRTC testing of bessel_i1
Add tgamma1pm1 NVRTC impl
Add GPU support to iconv
Add GPU test to bessel_ik
Add SYCL testing of complete bessel_i
Add GPU support to bessel_j0
Ignore BOOST_MATH_INSTRUMENT_VARIABLE on NVRTC
Add bessel J0 CUDA and NVRTC testing
Add GPU support to bessel_j1
Add bessel j1 CUDA and NVRTC testing
Add GPU support to bessel jn and jy
Add SYCL bessel j testing
Add bessel_k0 GPU support
Add bessel_k0 CUDA and NVRTC testing
Add GPU support to bessel_k1
Add bessel_k1 CUDA and NVRTC testing
Add GPU support to bessel_kn
Add bessel_kn CUDA and NVRTC testing
Add SYCL testing of complete bessel_k
Make newton-raphson GPU compatible
Make the completed bessel functions GPU compatible
Add SYCL bessel y testing
Apply changes for non-empty policy on CUDA
Add NVCC cyl_bessel_i testing
Add GPU support to sinc
Add GPU support to series functions
Add GPU support to bessel_jy_zero
Add array helper type
Make hypot GPU safe
Make bessel_yX GPU capable
Add bessel_y0 and bessel_y1 CUDA testing
Add nvrtc testing of bessel_y0 and bessel_y1
Fix macros
Add missing header
Add missing header
Markup iconv
Add iround for NVRTC
Add tgamma1pm1 with policy overload for NVRTC
Disable header
Fix factorial support for CUDA platforms
Add definition of bessel traits
Add cyl_bessel_i NVRTC testing
Fix cyl_bessel_jy warnings
Fix CUDA forward declarations
Fix maybe-unused variable warning
Add CUDA cyl_bessel_j testing
Add sign overload for lgamma
Fix warnings
Add NVRTC cyl_bessel_j testing
Add NVCC sph_bessel testing
Add NVRTC testing of sph_bessel
Add NVRTC testing of cyl_bessel_k
Add NVCC testing of cyl_bessel_k
Add NVCC testing of cyl_neumann
Add NVRTC cyl_neumann testing
Add NVRTC sph_neumann testing
Add NVCC sph_neumann testing
---
 .../boost/math/special_functions/bessel.hpp | 257 ++++++++++++------
 .../detail/airy_ai_bi_zero.hpp | 44 ++-
 .../special_functions/detail/bessel_i0.hpp | 78 +++---
 .../special_functions/detail/bessel_i1.hpp | 73 ++---
 .../special_functions/detail/bessel_ik.hpp | 32 ++-
 .../special_functions/detail/bessel_j0.hpp | 34 +--
 .../special_functions/detail/bessel_j1.hpp | 46 ++--
 .../special_functions/detail/bessel_jn.hpp | 6 +-
 .../special_functions/detail/bessel_jy.hpp | 44 +--
 .../detail/bessel_jy_asym.hpp | 21 +-
 .../detail/bessel_jy_series.hpp | 27 +-
 .../detail/bessel_jy_zero.hpp | 86 +++---
 .../special_functions/detail/bessel_k0.hpp | 114 ++++----
 .../special_functions/detail/bessel_k1.hpp | 126 +++++----
 .../special_functions/detail/bessel_kn.hpp | 8 +-
 .../special_functions/detail/bessel_y0.hpp | 27 +-
 .../special_functions/detail/bessel_y1.hpp | 23 +-
 .../special_functions/detail/bessel_yn.hpp | 6 +-
 .../math/special_functions/detail/iconv.hpp | 11 +-
 .../detail/unchecked_factorial.hpp | 96 ++++---
 .../boost/math/special_functions/expm1.hpp | 54 ++--
 .../math/special_functions/factorials.hpp | 31 ++-
 .../boost/math/special_functions/gamma.hpp | 43 ++-
 .../boost/math/special_functions/hypot.hpp | 20 +-
 .../boost/math/special_functions/math_fwd.hpp | 118 ++++----
 include/boost/math/special_functions/next.hpp | 7 +
 .../boost/math/special_functions/round.hpp | 24 ++
 include/boost/math/special_functions/sinc.hpp | 29 +-
 include/boost/math/special_functions/ulp.hpp | 1 +
 include/boost/math/tools/array.hpp | 41 +++
 include/boost/math/tools/config.hpp | 9 +-
 include/boost/math/tools/roots.hpp | 94 ++++---
 include/boost/math/tools/series.hpp | 70 +++--
 test/cuda_jamfile | 71 ++++-
 test/nvrtc_jamfile | 45 +++
 test/sycl_jamfile | 4 +
 test/test_bessel_i.cpp | 14 +
 test/test_bessel_i.hpp | 6 +
 test/test_bessel_i0_double.cu | 100 +++++++
 test/test_bessel_i0_float.cu | 100 +++++++
 test/test_bessel_i0_nvrtc_double.cpp | 190 +++++++++++++
 test/test_bessel_i0_nvrtc_float.cpp | 190 +++++++++++++
 test/test_bessel_i1_double.cu | 100 +++++++
 test/test_bessel_i1_float.cu | 100 +++++++
 test/test_bessel_i1_nvrtc_double.cpp | 190 +++++++++++++
 test/test_bessel_i1_nvrtc_float.cpp | 190 +++++++++++++
 test/test_bessel_j.cpp | 5 +
 test/test_bessel_j.hpp | 3 +
 test/test_bessel_j0_double.cu | 100 +++++++
 test/test_bessel_j0_float.cu | 100 +++++++
 test/test_bessel_j0_nvrtc_double.cpp | 190 +++++++++++++
 test/test_bessel_j0_nvrtc_float.cpp | 190 +++++++++++++
 test/test_bessel_j1_double.cu | 100 +++++++
 test/test_bessel_j1_float.cu | 100 +++++++
 test/test_bessel_j1_nvrtc_double.cpp | 190 +++++++++++++
 test/test_bessel_j1_nvrtc_float.cpp | 190 +++++++++++++
 test/test_bessel_k.cpp | 5 +
 test/test_bessel_k.hpp | 3 +
 test/test_bessel_k0_double.cu | 100 +++++++
 test/test_bessel_k0_float.cu | 100 +++++++
 test/test_bessel_k0_nvrtc_double.cpp | 190 +++++++++++++
 test/test_bessel_k0_nvrtc_float.cpp | 190 +++++++++++++
 test/test_bessel_k1_double.cu | 100 +++++++
 test/test_bessel_k1_float.cu | 100 +++++++
 test/test_bessel_k1_nvrtc_double.cpp | 190 +++++++++++++
 test/test_bessel_k1_nvrtc_float.cpp | 190 +++++++++++++
 test/test_bessel_kn_double.cu | 105 +++++++
 test/test_bessel_kn_float.cu | 105 +++++++
 test/test_bessel_kn_nvrtc_double.cpp | 192 +++++++++++++
 test/test_bessel_kn_nvrtc_float.cpp | 192 +++++++++++++
 test/test_bessel_y.cpp | 9 +
 test/test_bessel_y.hpp | 3 +
 test/test_bessel_y0_double.cu | 106 ++++++++
 test/test_bessel_y0_float.cu | 106 ++++++++
 test/test_bessel_y0_nvrtc_double.cpp | 194 +++++++++++++
 test/test_bessel_y0_nvrtc_float.cpp | 194 +++++++++++++
 test/test_bessel_y1_double.cu | 106 ++++++++
 test/test_bessel_y1_float.cu | 106 ++++++++
 test/test_bessel_y1_nvrtc_double.cpp | 194 +++++++++++++
 test/test_bessel_y1_nvrtc_float.cpp | 194 +++++++++++++
 test/test_cyl_bessel_i_double.cu | 104 +++++++
 test/test_cyl_bessel_i_float.cu | 104 +++++++
 test/test_cyl_bessel_i_nvrtc_double.cpp | 190 +++++++++++++
 test/test_cyl_bessel_i_nvrtc_float.cpp | 190 +++++++++++++
 test/test_cyl_bessel_j_double.cu | 104 +++++++
 test/test_cyl_bessel_j_float.cu | 104 +++++++
 test/test_cyl_bessel_j_nvrtc_double.cpp | 190 +++++++++++++
 test/test_cyl_bessel_j_nvrtc_float.cpp | 190 +++++++++++++
 test/test_cyl_bessel_k_double.cu | 104 +++++++
 test/test_cyl_bessel_k_float.cu | 104 +++++++
 test/test_cyl_bessel_k_nvrtc_double.cpp | 190 +++++++++++++
 test/test_cyl_bessel_k_nvrtc_float.cpp | 190 +++++++++++++
 test/test_cyl_neumann_double.cu | 116 ++++++++
 test/test_cyl_neumann_float.cu | 104 +++++++
 test/test_cyl_neumann_nvrtc_double.cpp | 190 +++++++++++++
 test/test_cyl_neumann_nvrtc_float.cpp | 190 +++++++++++++
 test/test_sph_bessel_double.cu | 119 ++++++++
 test/test_sph_bessel_float.cu | 119 ++++++++
 test/test_sph_bessel_nvrtc_double.cpp | 199 ++++++++++++++
 test/test_sph_bessel_nvrtc_float.cpp | 199 ++++++++++++++
 test/test_sph_neumann_double.cu | 116 ++++++++
 test/test_sph_neumann_float.cu | 116 ++++++++
 test/test_sph_neumann_nvrtc_double.cpp | 190 +++++++++++++
 test/test_sph_neumann_nvrtc_float.cpp | 190 +++++++++++++
 104 files changed, 10079 insertions(+), 675 deletions(-)
 create mode 100644 include/boost/math/tools/array.hpp
 create mode 100644 test/test_bessel_i0_double.cu
 create mode 100644 test/test_bessel_i0_float.cu
 create mode 100644 test/test_bessel_i0_nvrtc_double.cpp
 create mode 100644 test/test_bessel_i0_nvrtc_float.cpp
 create mode 100644 test/test_bessel_i1_double.cu
 create mode 100644 test/test_bessel_i1_float.cu
 create mode 100644 test/test_bessel_i1_nvrtc_double.cpp
 create mode 100644 test/test_bessel_i1_nvrtc_float.cpp
 create mode 100644 test/test_bessel_j0_double.cu
 create mode 100644 test/test_bessel_j0_float.cu
 create mode 100644 test/test_bessel_j0_nvrtc_double.cpp
 create mode 100644 test/test_bessel_j0_nvrtc_float.cpp
 create mode 100644 test/test_bessel_j1_double.cu
 create mode 100644 test/test_bessel_j1_float.cu
 create mode 100644 test/test_bessel_j1_nvrtc_double.cpp
 create mode 100644 test/test_bessel_j1_nvrtc_float.cpp
 create mode 100644 test/test_bessel_k0_double.cu
 create mode 100644 test/test_bessel_k0_float.cu
 create mode 100644 test/test_bessel_k0_nvrtc_double.cpp
 create mode 100644 test/test_bessel_k0_nvrtc_float.cpp
 create mode 100644 test/test_bessel_k1_double.cu
 create mode 100644 test/test_bessel_k1_float.cu
 create mode 100644 test/test_bessel_k1_nvrtc_double.cpp
 create mode 100644 test/test_bessel_k1_nvrtc_float.cpp
 create mode 100644 test/test_bessel_kn_double.cu
 create mode 100644 test/test_bessel_kn_float.cu
 create mode 100644 test/test_bessel_kn_nvrtc_double.cpp
 create mode 100644 test/test_bessel_kn_nvrtc_float.cpp
 create mode 100644 test/test_bessel_y0_double.cu
 create mode 100644 test/test_bessel_y0_float.cu
 create mode 100644 test/test_bessel_y0_nvrtc_double.cpp
 create mode 100644 test/test_bessel_y0_nvrtc_float.cpp
 create mode 100644 test/test_bessel_y1_double.cu
 create mode 100644 test/test_bessel_y1_float.cu
 create mode 100644 test/test_bessel_y1_nvrtc_double.cpp
 create mode 100644 test/test_bessel_y1_nvrtc_float.cpp
 create mode 100644 test/test_cyl_bessel_i_double.cu
 create mode 100644 test/test_cyl_bessel_i_float.cu
 create mode 100644 test/test_cyl_bessel_i_nvrtc_double.cpp
 create mode 100644 test/test_cyl_bessel_i_nvrtc_float.cpp
 create mode 100644 test/test_cyl_bessel_j_double.cu
 create mode 100644 test/test_cyl_bessel_j_float.cu
 create mode 100644 test/test_cyl_bessel_j_nvrtc_double.cpp
 create mode 100644 test/test_cyl_bessel_j_nvrtc_float.cpp
 create mode 100644 test/test_cyl_bessel_k_double.cu
 create mode 100644 test/test_cyl_bessel_k_float.cu
 create mode 100644 test/test_cyl_bessel_k_nvrtc_double.cpp
 create mode 100644 test/test_cyl_bessel_k_nvrtc_float.cpp
 create mode 100644 test/test_cyl_neumann_double.cu
 create mode 100644 test/test_cyl_neumann_float.cu
 create mode 100644 test/test_cyl_neumann_nvrtc_double.cpp
 create mode 100644 test/test_cyl_neumann_nvrtc_float.cpp
 create mode 100644 test/test_sph_bessel_double.cu
 create mode 100644 test/test_sph_bessel_float.cu
 create mode 100644 test/test_sph_bessel_nvrtc_double.cpp
 create mode 100644 test/test_sph_bessel_nvrtc_float.cpp
 create mode 100644 test/test_sph_neumann_double.cu
 create mode 100644 test/test_sph_neumann_float.cu
 create mode 100644 test/test_sph_neumann_nvrtc_double.cpp
 create mode 100644 test/test_sph_neumann_nvrtc_float.cpp

diff --git a/include/boost/math/special_functions/bessel.hpp b/include/boost/math/special_functions/bessel.hpp
index e9677d3c7..081473442 100644
--- a/include/boost/math/special_functions/bessel.hpp
+++ b/include/boost/math/special_functions/bessel.hpp
@@ -15,8 +15,14 @@
 # pragma once
 #endif
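// A note on the markup used from here on: nearly every function in this
// patch gains a BOOST_MATH_GPU_ENABLED prefix so that a single header can
// serve host, CUDA, NVRTC, and SYCL builds, and file-scope coefficient
// tables switch from `static const` to BOOST_MATH_STATIC for the same
// reason. The real definitions live in boost/math/tools/config.hpp; the
// expansion below is only an assumed sketch for illustration, not the
// library's verbatim configuration:
//
//    #if defined(__CUDACC__) || defined(__CUDACC_RTC__)
//    #  define BOOST_MATH_GPU_ENABLED __host__ __device__
//    #else
//    #  define BOOST_MATH_GPU_ENABLED
//    #endif
//
// BOOST_MATH_STATIC similarly abstracts the storage class of constant
// data, since device compilers treat static-duration objects in device
// functions differently from host compilers.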
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -31,10 +37,8 @@
 #include
 #include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
 
 #ifdef _MSC_VER
 # pragma warning(push)
@@ -43,6 +47,50 @@
 namespace boost{ namespace math{
 
+// Since we cannot pull this in from math fwd we need a copy
+#ifdef BOOST_MATH_HAS_NVRTC
+
+namespace detail{
+
+  typedef boost::math::integral_constant bessel_no_int_tag;    // No integer optimisation possible.
+  typedef boost::math::integral_constant bessel_maybe_int_tag; // Maybe integer optimisation.
+  typedef boost::math::integral_constant bessel_int_tag;       // Definite integer optimisation.
+
+  template
+  struct bessel_traits
+  {
+     using result_type = typename boost::math::conditional<
+        boost::math::is_integral::value,
+        typename tools::promote_args::type,
+        tools::promote_args_t
+     >::type;
+
+     typedef typename policies::precision::type precision_type;
+
+     using optimisation_tag = typename boost::math::conditional<
+        (precision_type::value <= 0 || precision_type::value > 64),
+        bessel_no_int_tag,
+        typename boost::math::conditional<
+           boost::math::is_integral::value,
+           bessel_int_tag,
+           bessel_maybe_int_tag
+        >::type
+     >::type;
+
+     using optimisation_tag128 = typename boost::math::conditional<
+        (precision_type::value <= 0 || precision_type::value > 113),
+        bessel_no_int_tag,
+        typename boost::math::conditional<
+           boost::math::is_integral::value,
+           bessel_int_tag,
+           bessel_maybe_int_tag
+        >::type
+     >::type;
+  };
+  } // detail
+
+#endif
+
 namespace detail{
 
 template
@@ -50,7 +98,7 @@ struct sph_bessel_j_small_z_series_term
 {
    typedef T result_type;
 
-   sph_bessel_j_small_z_series_term(unsigned v_, T x)
+   BOOST_MATH_GPU_ENABLED sph_bessel_j_small_z_series_term(unsigned v_, T x)
       : N(0), v(v_)
    {
       BOOST_MATH_STD_USING
@@ -64,7 +112,7 @@ struct sph_bessel_j_small_z_series_term
       term = pow(mult, T(v)) / boost::math::tgamma(v+1+T(0.5f), Policy());
      mult *= -mult;
    }
-   T operator()()
+   BOOST_MATH_GPU_ENABLED T operator()()
    {
      T r = term;
      ++N;
@@ -79,11 +127,11 @@ private:
 };
 
 template
-inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol)
+BOOST_MATH_GPU_ENABLED inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol)
 {
    BOOST_MATH_STD_USING // ADL of std names
    sph_bessel_j_small_z_series_term s(v, x);
-   std::uintmax_t max_iter = policies::get_max_series_iterations();
+   boost::math::uintmax_t max_iter = policies::get_max_series_iterations();
    T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon(), max_iter);
@@ -92,10 +140,21 @@ inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol)
 }
 
 template
-T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
+BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp_final(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
 {
    BOOST_MATH_STD_USING
-   static const char* function = "boost::math::bessel_j<%1%>(%1%,%1%)";
+
+   T result_J, y; // LCOV_EXCL_LINE
+   bessel_jy(v, x, &result_J, &y, need_j, pol);
+   return result_J;
+}
+
+// Dispatch function to avoid recursion
+template
+BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
+{
+   BOOST_MATH_STD_USING
+
    if(x < 0)
    {
       // better have integer v:
@@ -105,23 +164,27 @@ T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
       // This branch is hit by multiprecision types only, and is
       // tested by our real_concept tests, but these are excluded from coverage
       //
due to time constraints. - T r = cyl_bessel_j_imp(v, T(-x), t, pol); + T r = cyl_bessel_j_imp_final(T(v), T(-x), t, pol); if (iround(v, pol) & 1) + { r = -r; + } + return r; // LCOV_EXCL_STOP } else + { + constexpr auto function = "boost::math::bessel_j<%1%>(%1%,%1%)"; return policies::raise_domain_error(function, "Got x = %1%, but we need x >= 0", x, pol); + } } - T result_J, y; // LCOV_EXCL_LINE - bessel_jy(v, x, &result_J, &y, need_j, pol); - return result_J; + return cyl_bessel_j_imp_final(T(v), T(x), t, pol); } template -inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names. int ival = detail::iconv(v, pol); @@ -135,14 +198,14 @@ inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& p } template -inline T cyl_bessel_j_imp(int v, T x, const bessel_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_imp(int v, T x, const bessel_int_tag&, const Policy& pol) { BOOST_MATH_STD_USING return bessel_jn(v, x, pol); } template -inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names if(x < 0) @@ -171,7 +234,7 @@ inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol) } template -T cyl_bessel_i_imp(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T cyl_bessel_i_imp_final(T v, T x, const Policy& pol) { // // This handles all the bessel I functions, note that we don't optimise @@ -180,20 +243,7 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol) // case has better error handling too). // BOOST_MATH_STD_USING - static const char* function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)"; - if(x < 0) - { - // better have integer v: - if(floor(v) == v) - { - T r = cyl_bessel_i_imp(v, T(-x), pol); - if(iround(v, pol) & 1) - r = -r; - return r; - } - else - return policies::raise_domain_error(function, "Got x = %1%, but we need x >= 0", x, pol); - } + constexpr auto function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)"; if(x == 0) { if(v < 0) @@ -210,7 +260,7 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol) } return sqrt(2 / (x * constants::pi())) * sinh(x); } - if((policies::digits() <= 113) && (std::numeric_limits::digits <= 113) && (std::numeric_limits::radix == 2)) + if((policies::digits() <= 113) && (boost::math::numeric_limits::digits <= 113) && (boost::math::numeric_limits::radix == 2)) { if(v == 0) { @@ -228,10 +278,39 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol) return result_I; } +// Additional dispatch function to get the GPU impls happy template -inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Policy& pol) +BOOST_MATH_GPU_ENABLED T cyl_bessel_i_imp(T v, T x, const Policy& pol) { - static const char* function = "boost::math::cyl_bessel_k<%1%>(%1%,%1%)"; + BOOST_MATH_STD_USING + constexpr auto function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)"; + + if(x < 0) + { + // better have integer v: + if(floor(v) == v) + { + T r = cyl_bessel_i_imp_final(T(v), T(-x), pol); + if(iround(v, pol) & 1) + { + r = -r; + } + + return r; + } + else + { + return policies::raise_domain_error(function, "Got x = %1%, but we need x >= 0", x, pol); + } + } + + return cyl_bessel_i_imp_final(T(v), T(x), pol); +} + +template +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const 
Policy& pol) +{ + constexpr auto function = "boost::math::cyl_bessel_k<%1%>(%1%,%1%)"; BOOST_MATH_STD_USING if(x < 0) { @@ -248,7 +327,7 @@ inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Poli } template -inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) { BOOST_MATH_STD_USING if((floor(v) == v)) @@ -259,15 +338,15 @@ inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& p } template -inline T cyl_bessel_k_imp(int v, T x, const bessel_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(int v, T x, const bessel_int_tag&, const Policy& pol) { return bessel_kn(v, x, pol); } template -inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol) { - static const char* function = "boost::math::cyl_neumann<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::cyl_neumann<%1%>(%1%,%1%)"; BOOST_MATH_INSTRUMENT_VARIABLE(v); BOOST_MATH_INSTRUMENT_VARIABLE(x); @@ -291,7 +370,7 @@ inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol) } template -inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol) { BOOST_MATH_STD_USING @@ -310,16 +389,16 @@ inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& po } template -inline T cyl_neumann_imp(int v, T x, const bessel_int_tag&, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(int v, T x, const bessel_int_tag&, const Policy& pol) { return bessel_yn(v, x, pol); } template -inline T sph_neumann_imp(unsigned v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T sph_neumann_imp(unsigned v, T x, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names - static const char* function = "boost::math::sph_neumann<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::sph_neumann<%1%>(%1%,%1%)"; // // Nothing much to do here but check for errors, and // evaluate the function's definition directly: @@ -340,11 +419,11 @@ inline T sph_neumann_imp(unsigned v, T x, const Policy& pol) } template -inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for floor. - static const char* function = "boost::math::cyl_bessel_j_zero<%1%>(%1%, int)"; + constexpr auto function = "boost::math::cyl_bessel_j_zero<%1%>(%1%, int)"; const T half_epsilon(boost::math::tools::epsilon() / 2U); @@ -395,7 +474,7 @@ inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol) const T guess_root = boost::math::detail::bessel_zero::cyl_bessel_j_zero_detail::initial_guess((order_is_integer ? vv : v), m, pol); // Select the maximum allowed iterations from the policy. - std::uintmax_t number_of_iterations = policies::get_max_root_iterations(); + boost::math::uintmax_t number_of_iterations = policies::get_max_root_iterations(); const T delta_lo = ((guess_root > 0.2F) ? 
T(0.2) : T(guess_root / 2U)); @@ -418,11 +497,11 @@ inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol) } template -inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for floor. - static const char* function = "boost::math::cyl_neumann_zero<%1%>(%1%, int)"; + constexpr auto function = "boost::math::cyl_neumann_zero<%1%>(%1%, int)"; // Handle non-finite order. if (!(boost::math::isfinite)(v) ) @@ -473,7 +552,7 @@ inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol) const T guess_root = boost::math::detail::bessel_zero::cyl_neumann_zero_detail::initial_guess(v, m, pol); // Select the maximum allowed iterations from the policy. - std::uintmax_t number_of_iterations = policies::get_max_root_iterations(); + boost::math::uintmax_t number_of_iterations = policies::get_max_root_iterations(); const T delta_lo = ((guess_root > 0.2F) ? T(0.2) : T(guess_root / 2U)); @@ -498,7 +577,7 @@ inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol) } // namespace detail template -inline typename detail::bessel_traits::result_type cyl_bessel_j(T1 v, T2 x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_bessel_j(T1 v, T2 x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -514,13 +593,13 @@ inline typename detail::bessel_traits::result_type cyl_bessel_j( } template -inline typename detail::bessel_traits >::result_type cyl_bessel_j(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_bessel_j(T1 v, T2 x) { return cyl_bessel_j(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type sph_bessel(unsigned v, T x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type sph_bessel(unsigned v, T x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -535,13 +614,13 @@ inline typename detail::bessel_traits::result_type sph_bessel(unsi } template -inline typename detail::bessel_traits >::result_type sph_bessel(unsigned v, T x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type sph_bessel(unsigned v, T x) { return sph_bessel(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type cyl_bessel_i(T1 v, T2 x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_bessel_i(T1 v, T2 x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -556,13 +635,13 @@ inline typename detail::bessel_traits::result_type cyl_bessel_i( } template -inline typename detail::bessel_traits >::result_type cyl_bessel_i(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_bessel_i(T1 v, T2 x) { return cyl_bessel_i(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type cyl_bessel_k(T1 v, T2 x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_bessel_k(T1 v, T2 x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -578,13 +657,13 @@ inline typename detail::bessel_traits::result_type cyl_bessel_k( } 
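// For context on the change pattern above: cyl_bessel_j_imp (and likewise
// cyl_bessel_i_imp) was split into a dispatch layer plus a *_imp_final
// worker because the negative-x reflection formerly made the function call
// itself, and device compilers handle recursion poorly; the dispatcher now
// reflects first and calls a non-recursive worker. Once these front ends
// carry BOOST_MATH_GPU_ENABLED they can be invoked directly from device
// code, which is what the new test_*_double.cu / test_*_float.cu tests in
// the diffstat exercise. A minimal sketch of such usage (hypothetical
// kernel, not the actual test file contents):
//
//    __global__ void cyl_bessel_j_kernel(const double* v, const double* x,
//                                        double* out, int n)
//    {
//       int i = blockDim.x * blockIdx.x + threadIdx.x;
//       if (i < n)
//       {
//          out[i] = boost::math::cyl_bessel_j(v[i], x[i]);
//       }
//    }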
template -inline typename detail::bessel_traits >::result_type cyl_bessel_k(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_bessel_k(T1 v, T2 x) { return cyl_bessel_k(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type cyl_neumann(T1 v, T2 x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_neumann(T1 v, T2 x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -600,13 +679,13 @@ inline typename detail::bessel_traits::result_type cyl_neumann(T } template -inline typename detail::bessel_traits >::result_type cyl_neumann(T1 v, T2 x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_neumann(T1 v, T2 x) { return cyl_neumann(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type sph_neumann(unsigned v, T x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type sph_neumann(unsigned v, T x, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -621,13 +700,13 @@ inline typename detail::bessel_traits::result_type sph_neumann(uns } template -inline typename detail::bessel_traits >::result_type sph_neumann(unsigned v, T x) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type sph_neumann(unsigned v, T x) { return sph_neumann(v, x, policies::policy<>()); } template -inline typename detail::bessel_traits::result_type cyl_bessel_j_zero(T v, int m, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_bessel_j_zero(T v, int m, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -639,35 +718,35 @@ inline typename detail::bessel_traits::result_type cyl_bessel_j_ze policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - static_assert( false == std::numeric_limits::is_specialized - || ( true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); return policies::checked_narrowing_cast(detail::cyl_bessel_j_zero_imp(v, m, forwarding_policy()), "boost::math::cyl_bessel_j_zero<%1%>(%1%,%1%)"); } template -inline typename detail::bessel_traits >::result_type cyl_bessel_j_zero(T v, int m) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_bessel_j_zero(T v, int m) { - static_assert( false == std::numeric_limits::is_specialized - || ( true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); return cyl_bessel_j_zero >(v, m, policies::policy<>()); } template -inline OutputIterator cyl_bessel_j_zero(T v, +BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_bessel_j_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it, const Policy& pol) { - static_assert( false == std::numeric_limits::is_specialized - || ( 
true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); for(int i = 0; i < static_cast(number_of_zeros); ++i) @@ -679,7 +758,7 @@ inline OutputIterator cyl_bessel_j_zero(T v, } template -inline OutputIterator cyl_bessel_j_zero(T v, +BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_bessel_j_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it) @@ -688,7 +767,7 @@ inline OutputIterator cyl_bessel_j_zero(T v, } template -inline typename detail::bessel_traits::result_type cyl_neumann_zero(T v, int m, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits::result_type cyl_neumann_zero(T v, int m, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD typedef typename detail::bessel_traits::result_type result_type; @@ -700,35 +779,35 @@ inline typename detail::bessel_traits::result_type cyl_neumann_zer policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - static_assert( false == std::numeric_limits::is_specialized - || ( true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); return policies::checked_narrowing_cast(detail::cyl_neumann_zero_imp(v, m, forwarding_policy()), "boost::math::cyl_neumann_zero<%1%>(%1%,%1%)"); } template -inline typename detail::bessel_traits >::result_type cyl_neumann_zero(T v, int m) +BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits >::result_type cyl_neumann_zero(T v, int m) { - static_assert( false == std::numeric_limits::is_specialized - || ( true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); return cyl_neumann_zero >(v, m, policies::policy<>()); } template -inline OutputIterator cyl_neumann_zero(T v, +BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_neumann_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it, const Policy& pol) { - static_assert( false == std::numeric_limits::is_specialized - || ( true == std::numeric_limits::is_specialized - && false == std::numeric_limits::is_integer), + static_assert( false == boost::math::numeric_limits::is_specialized + || ( true == boost::math::numeric_limits::is_specialized + && false == boost::math::numeric_limits::is_integer), "Order must be a floating-point type."); for(int i = 0; i < static_cast(number_of_zeros); ++i) @@ -740,7 +819,7 @@ inline OutputIterator cyl_neumann_zero(T v, } template -inline OutputIterator cyl_neumann_zero(T v, +BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_neumann_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it) diff --git a/include/boost/math/special_functions/detail/airy_ai_bi_zero.hpp b/include/boost/math/special_functions/detail/airy_ai_bi_zero.hpp index 7735eb858..e518422f1 100644 --- a/include/boost/math/special_functions/detail/airy_ai_bi_zero.hpp +++ 
b/include/boost/math/special_functions/detail/airy_ai_bi_zero.hpp @@ -13,6 +13,8 @@ #ifndef BOOST_MATH_AIRY_AI_BI_ZERO_2013_01_20_HPP_ #define BOOST_MATH_AIRY_AI_BI_ZERO_2013_01_20_HPP_ + #include + #include #include #include @@ -21,18 +23,18 @@ { // Forward declarations of the needed Airy function implementations. template - T airy_ai_imp(T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED T airy_ai_imp(T x, const Policy& pol); template - T airy_bi_imp(T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED T airy_bi_imp(T x, const Policy& pol); template - T airy_ai_prime_imp(T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED T airy_ai_prime_imp(T x, const Policy& pol); template - T airy_bi_prime_imp(T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED T airy_bi_prime_imp(T x, const Policy& pol); namespace airy_zero { template - T equation_as_10_4_105(const T& z, const Policy& pol) + BOOST_MATH_GPU_ENABLED T equation_as_10_4_105(const T& z, const Policy& pol) { const T one_over_z (T(1) / z); const T one_over_z_squared(one_over_z * one_over_z); @@ -54,7 +56,7 @@ namespace airy_ai_zero_detail { template - T initial_guess(const int m, const Policy& pol) + BOOST_MATH_GPU_ENABLED T initial_guess(const int m, const Policy& pol) { T guess; @@ -106,11 +108,19 @@ class function_object_ai_and_ai_prime { public: - explicit function_object_ai_and_ai_prime(const Policy& pol) : my_pol(pol) { } + BOOST_MATH_GPU_ENABLED explicit function_object_ai_and_ai_prime(const Policy& pol) : my_pol(pol) { } - function_object_ai_and_ai_prime(const function_object_ai_and_ai_prime&) = default; + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_suppress 20012 + #endif - boost::math::tuple operator()(const T& x) const + BOOST_MATH_GPU_ENABLED function_object_ai_and_ai_prime(const function_object_ai_and_ai_prime&) = default; + + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_default 20012 + #endif + + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& x) const { // Return a tuple containing both Ai(x) and Ai'(x). return boost::math::make_tuple( @@ -127,7 +137,7 @@ namespace airy_bi_zero_detail { template - T initial_guess(const int m, const Policy& pol) + BOOST_MATH_GPU_ENABLED T initial_guess(const int m, const Policy& pol) { T guess; @@ -179,11 +189,19 @@ class function_object_bi_and_bi_prime { public: - explicit function_object_bi_and_bi_prime(const Policy& pol) : my_pol(pol) { } + BOOST_MATH_GPU_ENABLED explicit function_object_bi_and_bi_prime(const Policy& pol) : my_pol(pol) { } - function_object_bi_and_bi_prime(const function_object_bi_and_bi_prime&) = default; + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_suppress 20012 + #endif + + BOOST_MATH_GPU_ENABLED function_object_bi_and_bi_prime(const function_object_bi_and_bi_prime&) = default; + + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_default 20012 + #endif - boost::math::tuple operator()(const T& x) const + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& x) const { // Return a tuple containing both Bi(x) and Bi'(x). 
return boost::math::make_tuple( diff --git a/include/boost/math/special_functions/detail/bessel_i0.hpp b/include/boost/math/special_functions/detail/bessel_i0.hpp index af6e8c379..f2219cc94 100644 --- a/include/boost/math/special_functions/detail/bessel_i0.hpp +++ b/include/boost/math/special_functions/detail/bessel_i0.hpp @@ -1,5 +1,6 @@ // Copyright (c) 2006 Xiaogang Zhang // Copyright (c) 2017 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -14,6 +15,9 @@ #include #include #include +#include +#include +#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -35,24 +39,24 @@ namespace boost { namespace math { namespace detail{ template -T bessel_i0(const T& x); +BOOST_MATH_GPU_ENABLED T bessel_i0(const T& x); template -T bessel_i0_imp(const T&, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T&, const boost::math::integral_constant&) { BOOST_MATH_ASSERT(0); return 0; } template -T bessel_i0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) { // Max error in interpolated form: 3.929e-08 // Max Error found at float precision = Poly: 1.991226e-07 - static const float P[] = { + BOOST_MATH_STATIC const float P[] = { 1.00000003928615375e+00f, 2.49999576572179639e-01f, 2.77785268558399407e-02f, @@ -70,7 +74,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form: 5.195e-08 // Max Error found at float precision = Poly: 8.502534e-08 - static const float P[] = { + BOOST_MATH_STATIC const float P[] = { 3.98942651588301770e-01f, 4.98327234176892844e-02f, 2.91866904423115499e-02f, @@ -83,7 +87,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form: 1.782e-09 // Max Error found at float precision = Poly: 6.473568e-08 - static const float P[] = { + BOOST_MATH_STATIC const float P[] = { 3.98942391532752700e-01f, 4.98455950638200020e-02f, 2.94835666900682535e-02f @@ -96,7 +100,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) } template -T bessel_i0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -104,7 +108,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Bessel I0 over[10 ^ -16, 7.75] // Max error in interpolated form : 3.042e-18 // Max Error found at double precision = Poly : 5.106609e-16 Cheb : 5.239199e-16 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 1.00000000000000000e+00, 2.49999999999999909e-01, 2.77777777777782257e-02, @@ -128,7 +132,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form : 1.685e-16 // Max Error found at double precision = Poly : 2.575063e-16 Cheb : 2.247615e+00 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 3.98942280401425088e-01, 4.98677850604961985e-02, 2.80506233928312623e-02, @@ -158,7 +162,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form : 2.437e-18 // Max Error found at double precision = Poly : 1.216719e-16 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 3.98942280401432905e-01, 4.98677850491434560e-02, 
2.80506308916506102e-02, @@ -173,7 +177,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) } template -T bessel_i0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -182,7 +186,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 3.899e-20 // Max Error found at float80 precision = Poly : 1.770840e-19 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 9.99999999999999999961011629e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 2.50000000000000001321873912e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 2.77777777777777703400424216e-02), @@ -211,8 +215,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 1.631e-04 // Max Error found at float80 precision = Poly : 7.811948e-21 // LCOV_EXCL_START - static const T Y = 4.051098823547363281250e-01f; - static const T P[] = { + BOOST_MATH_STATIC const T Y = 4.051098823547363281250e-01f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -6.158081780620616479492e-03), BOOST_MATH_BIG_CONSTANT(T, 64, 4.883635969834048766148e-02), BOOST_MATH_BIG_CONSTANT(T, 64, 7.892782002476195771920e-02), @@ -237,8 +241,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 1.304e-03 // Max Error found at float80 precision = Poly : 2.303527e-20 // LCOV_EXCL_START - static const T Y = 4.033188819885253906250e-01f; - static const T P[] = { + BOOST_MATH_STATIC const T Y = 4.033188819885253906250e-01f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -4.376373876116109401062e-03), BOOST_MATH_BIG_CONSTANT(T, 64, 4.982899138682911273321e-02), BOOST_MATH_BIG_CONSTANT(T, 64, 3.109477529533515397644e-02), @@ -262,8 +266,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 1.035e-21 // Max Error found at float80 precision = Poly: 1.885872e-21 // LCOV_EXCL_START - static const T Y = 4.011702537536621093750e-01f; - static const T P[] = { + BOOST_MATH_STATIC const T Y = 4.011702537536621093750e-01f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -2.227973351806078464328e-03), BOOST_MATH_BIG_CONSTANT(T, 64, 4.986778486088017419036e-02), BOOST_MATH_BIG_CONSTANT(T, 64, 2.805066823812285310011e-02), @@ -291,7 +295,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 5.587e-20 // Max Error found at float80 precision = Poly : 8.776852e-20 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401432677955074061e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 4.98677850501789875615574058e-02), BOOST_MATH_BIG_CONSTANT(T, 64, 2.80506290908675604202206833e-02), @@ -320,7 +324,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) } template -T bessel_i0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -329,7 +333,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 1.274e-34 // Max Error found at float128 precision = Poly : 3.096091e-34 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 
1.0000000000000000000000000000000001273856e+00), BOOST_MATH_BIG_CONSTANT(T, 113, 2.4999999999999999999999999999999107477496e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.7777777777777777777777777777881795230918e-02), @@ -364,7 +368,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 7.534e-35 // Max Error found at float128 precision = Poly : 6.123912e-34 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 9.9999999999999999992388573069504617493518e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.5000000000000000007304739268173096975340e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.7777777777777777744261405400543564492074e-02), @@ -403,7 +407,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 1.808e-34 // Max Error found at float128 precision = Poly : 2.399403e-34 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040870793650581242239624530714032e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867780576714783790784348982178607842250e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 2.8051948347934462928487999569249907599510e-02), @@ -445,7 +449,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 1.487e-34 // Max Error found at float128 precision = Poly : 1.929924e-34 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793996798658172135362278e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867785050179084714910130342157246539820e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 2.8050629090725751585266360464766768437048e-02), @@ -480,7 +484,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) // Max error in interpolated form : 5.459e-35 // Max Error found at float128 precision = Poly : 1.472240e-34 // LCOV_EXCL_START - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793994605993438166526772e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867785050179084742493257495245185241487e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 2.8050629090725735167652437695397756897920e-02), @@ -507,33 +511,33 @@ T bessel_i0_imp(const T& x, const std::integral_constant&) } template -T bessel_i0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant&) { if(boost::math::tools::digits() <= 24) - return bessel_i0_imp(x, std::integral_constant()); + return bessel_i0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 53) - return bessel_i0_imp(x, std::integral_constant()); + return bessel_i0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 64) - return bessel_i0_imp(x, std::integral_constant()); + return bessel_i0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 113) - return bessel_i0_imp(x, std::integral_constant()); + return bessel_i0_imp(x, boost::math::integral_constant()); BOOST_MATH_ASSERT(0); return 0; } template -inline T bessel_i0(const T& x) +BOOST_MATH_GPU_ENABLED inline T bessel_i0(const T& x) { - typedef std::integral_constant::digits == 0) || (std::numeric_limits::radix != 2)) ? + typedef boost::math::integral_constant::digits == 0) || (boost::math::numeric_limits::radix != 2)) ? 0 : - std::numeric_limits::digits <= 24 ? + boost::math::numeric_limits::digits <= 24 ? 
24 : - std::numeric_limits::digits <= 53 ? + boost::math::numeric_limits::digits <= 53 ? 53 : - std::numeric_limits::digits <= 64 ? + boost::math::numeric_limits::digits <= 64 ? 64 : - std::numeric_limits::digits <= 113 ? + boost::math::numeric_limits::digits <= 113 ? 113 : -1 > tag_type; diff --git a/include/boost/math/special_functions/detail/bessel_i1.hpp b/include/boost/math/special_functions/detail/bessel_i1.hpp index badc35de0..d2c750df0 100644 --- a/include/boost/math/special_functions/detail/bessel_i1.hpp +++ b/include/boost/math/special_functions/detail/bessel_i1.hpp @@ -1,4 +1,5 @@ // Copyright (c) 2017 John Maddock +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -17,9 +18,13 @@ #pragma once #endif +#include #include #include #include +#include +#include +#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -38,24 +43,24 @@ namespace boost { namespace math { namespace detail{ template -T bessel_i1(const T& x); +BOOST_MATH_GPU_ENABLED T bessel_i1(const T& x); template -T bessel_i1_imp(const T&, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T&, const boost::math::integral_constant&) { BOOST_MATH_ASSERT(0); return 0; } template -T bessel_i1_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) { //Max error in interpolated form : 1.348e-08 // Max Error found at float precision = Poly : 1.469121e-07 - static const float P[] = { + BOOST_MATH_STATIC const float P[] = { 8.333333221e-02f, 6.944453712e-03f, 3.472097211e-04f, @@ -74,7 +79,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 9.000e-08 // Max Error found at float precision = Poly: 1.044345e-07 - static const float P[] = { + BOOST_MATH_STATIC const float P[] = { 3.98942115977513013e-01f, -1.49581264836620262e-01f, -4.76475741878486795e-02f, @@ -89,7 +94,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) } template -T bessel_i1_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -98,7 +103,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 5.639e-17 // Max Error found at double precision = Poly: 1.795559e-16 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 8.333333333333333803e-02, 6.944444444444341983e-03, 3.472222222225921045e-04, @@ -122,7 +127,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 1.796e-16 // Max Error found at double precision = Poly: 2.898731e-16 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 3.989422804014406054e-01, -1.496033551613111533e-01, -4.675104253598537322e-02, @@ -152,7 +157,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form: 1.320e-19 // Max Error found at double precision = Poly: 7.065357e-17 - static const double P[] = { + BOOST_MATH_STATIC const double P[] = { 3.989422804014314820e-01, -1.496033551467584157e-01, -4.675105322571775911e-02, @@ -167,7 +172,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) } template -T bessel_i1_imp(const T& x, const 
std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -175,7 +180,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Bessel I0 over[10 ^ -16, 7.75] // Max error in interpolated form: 8.086e-21 // Max Error found at float80 precision = Poly: 7.225090e-20 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 8.33333333333333333340071817e-02), BOOST_MATH_BIG_CONSTANT(T, 64, 6.94444444444444442462728070e-03), BOOST_MATH_BIG_CONSTANT(T, 64, 3.47222222222222318886683883e-04), @@ -203,7 +208,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Maximum Deviation Found : 3.887e-20 // Expected Error Term : 3.887e-20 // Maximum Relative Change in Control Points : 1.681e-04 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942260530218897338680e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -1.49599542849073670179540e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -4.70492865454119188276875e-02), @@ -236,7 +241,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 2.101e-03 // Max Error found at float80 precision = Poly : 6.029974e-20 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401431675205845e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -1.49603355149968887210170e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -4.67510486284376330257260e-02), @@ -258,7 +263,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Bessel I0 over[100, INF] // Max error in interpolated form: 2.456e-20 // Max Error found at float80 precision = Poly: 5.446356e-20 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401432677958445e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -1.49603355150537411254359e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -4.67510484842456251368526e-02), @@ -276,7 +281,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) } template -T bessel_i1_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x < 7.75) @@ -285,7 +290,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 1.835e-35 // Max Error found at float128 precision = Poly: 1.645036e-34 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 8.3333333333333333333333333333333331804098e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 6.9444444444444444444444444444445418303082e-03), BOOST_MATH_BIG_CONSTANT(T, 113, 3.4722222222222222222222222222119082346591e-04), @@ -321,7 +326,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 5.204e-03 // Max Error found at float128 precision = Poly : 2.882561e-34 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 8.333333333333333326889717360850080939e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 6.944444444444444511272790848815114507e-03), BOOST_MATH_BIG_CONSTANT(T, 113, 3.472222222222221892451965054394153443e-04), @@ -355,7 +360,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Maximum Deviation Found : 1.766e-35 // Expected Error Term : 1.021e-35 // Maximum Relative Change in Control Points : 6.228e-03 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { 
BOOST_MATH_BIG_CONSTANT(T, 113, 8.333333333333255774414858563409941233e-02), BOOST_MATH_BIG_CONSTANT(T, 113, 6.944444444444897867884955912228700291e-03), BOOST_MATH_BIG_CONSTANT(T, 113, 3.472222222220954970397343617150959467e-04), @@ -389,7 +394,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) { // Max error in interpolated form: 8.864e-36 // Max Error found at float128 precision = Poly: 8.522841e-35 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422793693152031514179994954750043e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -1.496029423752889591425633234009799670e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -4.682975926820553021482820043377990241e-02), @@ -421,7 +426,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 6.028e-35 // Max Error found at float128 precision = Poly: 1.368313e-34 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422804012941975429616956496046931e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -1.496033550576049830976679315420681402e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -4.675107835141866009896710750800622147e-02), @@ -456,7 +461,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Max error in interpolated form: 5.494e-35 // Max Error found at float128 precision = Poly: 1.214651e-34 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422804014326779399307367861631577e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -1.496033551505372542086590873271571919e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -4.675104848454290286276466276677172664e-02), @@ -486,7 +491,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) // Bessel I0 over[100, INF] // Max error in interpolated form: 6.081e-35 // Max Error found at float128 precision = Poly: 1.407151e-34 - static const T P[] = { + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793994605993438200208417e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -1.4960335515053725422747977247811372936584e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -4.6751048484542891946087411826356811991039e-02), @@ -512,33 +517,33 @@ T bessel_i1_imp(const T& x, const std::integral_constant&) } template -T bessel_i1_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant&) { if(boost::math::tools::digits() <= 24) - return bessel_i1_imp(x, std::integral_constant()); + return bessel_i1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 53) - return bessel_i1_imp(x, std::integral_constant()); + return bessel_i1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 64) - return bessel_i1_imp(x, std::integral_constant()); + return bessel_i1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 113) - return bessel_i1_imp(x, std::integral_constant()); + return bessel_i1_imp(x, boost::math::integral_constant()); BOOST_MATH_ASSERT(0); return 0; } template -inline T bessel_i1(const T& x) +inline BOOST_MATH_GPU_ENABLED T bessel_i1(const T& x) { - typedef std::integral_constant::digits == 0) || (std::numeric_limits::radix != 2)) ? + typedef boost::math::integral_constant::digits == 0) || (boost::math::numeric_limits::radix != 2)) ? 0 : - std::numeric_limits::digits <= 24 ? + boost::math::numeric_limits::digits <= 24 ? 24 : - std::numeric_limits::digits <= 53 ? 
+ boost::math::numeric_limits::digits <= 53 ? 53 : - std::numeric_limits::digits <= 64 ? + boost::math::numeric_limits::digits <= 64 ? 64 : - std::numeric_limits::digits <= 113 ? + boost::math::numeric_limits::digits <= 113 ? 113 : -1 > tag_type; diff --git a/include/boost/math/special_functions/detail/bessel_ik.hpp b/include/boost/math/special_functions/detail/bessel_ik.hpp index 0c653b475..b3e7378fd 100644 --- a/include/boost/math/special_functions/detail/bessel_ik.hpp +++ b/include/boost/math/special_functions/detail/bessel_ik.hpp @@ -1,4 +1,5 @@ // Copyright (c) 2006 Xiaogang Zhang +// Copyright (c) 2024 Matt Borland // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -10,14 +11,17 @@ #pragma once #endif -#include -#include +#include +#include +#include +#include +#include +#include #include #include #include #include #include -#include // Modified Bessel functions of the first and second kind of fractional order @@ -30,13 +34,13 @@ struct cyl_bessel_i_small_z { typedef T result_type; - cyl_bessel_i_small_z(T v_, T z_) : k(0), v(v_), mult(z_*z_/4) + BOOST_MATH_GPU_ENABLED cyl_bessel_i_small_z(T v_, T z_) : k(0), v(v_), mult(z_*z_/4) { BOOST_MATH_STD_USING term = 1; } - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { T result = term; ++k; @@ -52,7 +56,7 @@ private: }; template -inline T bessel_i_small_z_series(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T bessel_i_small_z_series(T v, T x, const Policy& pol) { BOOST_MATH_STD_USING T prefix; @@ -69,7 +73,7 @@ inline T bessel_i_small_z_series(T v, T x, const Policy& pol) return prefix; cyl_bessel_i_small_z s(v, x); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon(), max_iter); @@ -80,7 +84,7 @@ inline T bessel_i_small_z_series(T v, T x, const Policy& pol) // Calculate K(v, x) and K(v+1, x) by method analogous to // Temme, Journal of Computational Physics, vol 21, 343 (1976) template -int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol) +BOOST_MATH_GPU_ENABLED int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol) { T f, h, p, q, coef, sum, sum1, tolerance; T a, b, c, d, sigma, gamma1, gamma2; @@ -157,7 +161,7 @@ int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol) // Evaluate continued fraction fv = I_(v+1) / I_v, derived from // Abramowitz and Stegun, Handbook of Mathematical Functions, 1972, 9.1.73 template -int CF1_ik(T v, T x, T* fv, const Policy& pol) +BOOST_MATH_GPU_ENABLED int CF1_ik(T v, T x, T* fv, const Policy& pol) { T C, D, f, a, b, delta, tiny, tolerance; unsigned long k; @@ -204,7 +208,7 @@ int CF1_ik(T v, T x, T* fv, const Policy& pol) // z1 / z0 = U(v+1.5, 2v+1, 2x) / U(v+0.5, 2v+1, 2x), see // Thompson and Barnett, Computer Physics Communications, vol 47, 245 (1987) template -int CF2_ik(T v, T x, T* Kv, T* Kv1, const Policy& pol) +BOOST_MATH_GPU_ENABLED int CF2_ik(T v, T x, T* Kv, T* Kv1, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::constants; @@ -297,7 +301,7 @@ enum{ // Compute I(v, x) and K(v, x) simultaneously by Temme's method, see // Temme, Journal of Computational Physics, vol 19, 324 (1975) template -int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol) +BOOST_MATH_GPU_ENABLED int bessel_ik(T v, T x, T* 
result_I, T* result_K, int kind, const Policy& pol) { // Kv1 = K_(v+1), fv = I_(v+1) / I_v // Ku1 = K_(u+1), fu = I_(u+1) / I_u @@ -314,7 +318,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol) using namespace boost::math::tools; using namespace boost::math::constants; - static const char* function = "boost::math::bessel_ik<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::bessel_ik<%1%>(%1%,%1%)"; if (v < 0) { @@ -329,7 +333,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol) if (((kind & need_i) == 0) && (fabs(4 * v * v - 25) / (8 * x) < tools::forth_root_epsilon())) { // A&S 9.7.2 - Iv = std::numeric_limits::quiet_NaN(); // any value will do + Iv = boost::math::numeric_limits::quiet_NaN(); // any value will do T mu = 4 * v * v; T eight_z = 8 * x; Kv = 1 + (mu - 1) / eight_z + (mu - 1) * (mu - 9) / (2 * eight_z * eight_z) + (mu - 1) * (mu - 9) * (mu - 25) / (6 * eight_z * eight_z * eight_z); @@ -410,7 +414,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol) } } else - Iv = std::numeric_limits::quiet_NaN(); // any value will do + Iv = boost::math::numeric_limits::quiet_NaN(); // any value will do } if (reflect) { diff --git a/include/boost/math/special_functions/detail/bessel_j0.hpp b/include/boost/math/special_functions/detail/bessel_j0.hpp index 9a0b26fe6..2df027b21 100644 --- a/include/boost/math/special_functions/detail/bessel_j0.hpp +++ b/include/boost/math/special_functions/detail/bessel_j0.hpp @@ -10,6 +10,7 @@ #pragma once #endif +#include #include #include #include @@ -32,10 +33,10 @@ namespace boost { namespace math { namespace detail{ template -T bessel_j0(T x); +BOOST_MATH_GPU_ENABLED T bessel_j0(T x); template -T bessel_j0(T x) +BOOST_MATH_GPU_ENABLED T bessel_j0(T x) { #ifdef BOOST_MATH_INSTRUMENT static bool b = false; @@ -48,7 +49,7 @@ T bessel_j0(T x) } #endif - static const T P1[] = { + BOOST_MATH_STATIC const T P1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -4.1298668500990866786e+11)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.7282507878605942706e+10)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -6.2140700423540120665e+08)), @@ -57,7 +58,7 @@ T bessel_j0(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0344222815443188943e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2117036164593528341e-01)) }; - static const T Q1[] = { + BOOST_MATH_STATIC const T Q1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.3883787996332290397e+12)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.6328198300859648632e+10)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.3985097372263433271e+08)), @@ -66,7 +67,7 @@ T bessel_j0(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)) }; - static const T P2[] = { + BOOST_MATH_STATIC const T P2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8319397969392084011e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2254078161378989535e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -7.2879702464464618998e+03)), @@ -76,7 +77,7 @@ T bessel_j0(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.4321196680624245801e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.8591703355916499363e+01)) }; - static const T Q2[] = { + BOOST_MATH_STATIC const T Q2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.5783478026152301072e+05)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4599102262586308984e+05)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.4055062591169562211e+04)), @@ -86,7 +87,7 @@ 
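// Aside: CF1_ik above evaluates the ratio f = I_{v+1}(x) / I_v(x) from the
// continued fraction of A&S 9.1.73 using the modified Lentz algorithm, which
// is what the tiny/tolerance locals are for.  A generic sketch of Lentz's
// scheme for a fraction b0 + a1/(b1 + a2/(b2 + ...)); lentz_cf_sketch and the
// a/b callbacks are illustrative:
#include <cmath>

double lentz_cf_sketch(double (*a)(int), double (*b)(int), int max_k)
{
   const double tiny = 1e-30;          // guards against zero denominators
   double f = b(0); if (f == 0) f = tiny;
   double C = f, D = 0;
   for (int k = 1; k < max_k; ++k)
   {
      D = b(k) + a(k) * D; if (D == 0) D = tiny;
      C = b(k) + a(k) / C; if (C == 0) C = tiny;
      D = 1 / D;
      const double delta = C * D;
      f *= delta;
      if (std::fabs(delta - 1) < 1e-15)
         break;                        // converged
   }
   return f;
}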
T bessel_j0(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -2.5258076240801555057e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)) }; - static const T PC[] = { + BOOST_MATH_STATIC const T PC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684302e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1345386639580765797e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1170523380864944322e+04)), @@ -94,7 +95,7 @@ T bessel_j0(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5376201909008354296e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.8961548424210455236e-01)) }; - static const T QC[] = { + BOOST_MATH_STATIC const T QC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684318e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1370412495510416640e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1215350561880115730e+04)), @@ -102,7 +103,7 @@ T bessel_j0(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5711159858080893649e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)) }; - static const T PS[] = { + BOOST_MATH_STATIC const T PS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.9226600200800094098e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8591953644342993800e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1183429920482737611e+02)), @@ -110,7 +111,7 @@ T bessel_j0(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2441026745835638459e+00)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.8033303048680751817e-03)) }; - static const T QS[] = { + BOOST_MATH_STATIC const T QS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.7105024128512061905e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1951131543434613647e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.2642780169211018836e+03)), @@ -118,12 +119,13 @@ T bessel_j0(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 9.0593769594993125859e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)) }; - static const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4048255576957727686e+00)), - x2 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5200781102863106496e+00)), - x11 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.160e+02)), - x12 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.42444230422723137837e-03)), - x21 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4130e+03)), - x22 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.46860286310649596604e-04)); + + BOOST_MATH_STATIC const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4048255576957727686e+00)); + BOOST_MATH_STATIC const T x2 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5200781102863106496e+00)); + BOOST_MATH_STATIC const T x11 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.160e+02)); + BOOST_MATH_STATIC const T x12 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.42444230422723137837e-03)); + BOOST_MATH_STATIC const T x21 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4130e+03)); + BOOST_MATH_STATIC const T x22 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.46860286310649596604e-04)); T value, factor, r, rc, rs; diff --git a/include/boost/math/special_functions/detail/bessel_j1.hpp b/include/boost/math/special_functions/detail/bessel_j1.hpp index 6d354dcce..43df9fa0c 100644 --- a/include/boost/math/special_functions/detail/bessel_j1.hpp +++ b/include/boost/math/special_functions/detail/bessel_j1.hpp @@ -10,6 +10,7 @@ #pragma once #endif +#include #include #include #include @@ -32,27 +33,29 @@ namespace boost { namespace math{ namespace detail{ template -T bessel_j1(T x); +BOOST_MATH_GPU_ENABLED T 
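// Aside: the x1/x2 and x11/x12/x21/x22 constants above encode the first two
// roots of J0.  Each root is split into a coarse part that is exactly
// representable (x11 / 256) plus a tiny correction (x12), so the factor
// (x - x1) can be formed without cancellation when x sits near the root:
//   J0(x) = (x + x1) * ((x - x11/256) - x12) * R(x^2)   for x in (0, 4].
// Minimal sketch of forming that factor (constants copied from the table):
double j0_root_factor_sketch(double x)
{
   const double x11 = 6.160e+02;                    // 256 * high part of x1
   const double x12 = -1.42444230422723137837e-03;  // low-order correction
   return (x - x11 / 256) - x12;                    // accurate (x - x1)
}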
bessel_j1(T x); template struct bessel_j1_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(); } - static void do_init() + BOOST_MATH_GPU_ENABLED static void do_init() { bessel_j1(T(1)); } - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; - static const init initializer; - static void force_instantiate() + BOOST_MATH_STATIC const init initializer; + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -60,11 +63,11 @@ template const typename bessel_j1_initializer::init bessel_j1_initializer::initializer; template -T bessel_j1(T x) +BOOST_MATH_GPU_ENABLED T bessel_j1(T x) { bessel_j1_initializer::force_instantiate(); - static const T P1[] = { + BOOST_MATH_STATIC const T P1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4258509801366645672e+11)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6781041261492395835e+09)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1548696764841276794e+08)), @@ -73,7 +76,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0650724020080236441e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.0767857011487300348e-02)) }; - static const T Q1[] = { + BOOST_MATH_STATIC const T Q1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1868604460820175290e+12)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.2091902282580133541e+10)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.0228375140097033958e+08)), @@ -82,7 +85,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)) }; - static const T P2[] = { + BOOST_MATH_STATIC const T P2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.7527881995806511112e+16)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.6608531731299018674e+15)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.6658018905416665164e+13)), @@ -92,7 +95,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -7.5023342220781607561e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.6179191852758252278e+00)) }; - static const T Q2[] = { + BOOST_MATH_STATIC const T Q2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7253905888447681194e+18)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7128800897135812012e+16)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.4899346165481429307e+13)), @@ -102,7 +105,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.3886978985861357615e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)) }; - static const T PC[] = { + BOOST_MATH_STATIC const T PC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278571e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9422465050776411957e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -6.6033732483649391093e+06)), @@ -111,7 +114,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.6116166443246101165e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)) }; - static const T QC[] = { + BOOST_MATH_STATIC const T QC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278568e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9341243899345856590e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -6.5853394797230870728e+06)), @@ -120,7 +123,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4550094401904961825e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)) }; - static const T PS[] = { + BOOST_MATH_STATIC const T PS[] = { 
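// Aside: bessel_j1_initializer above is the usual Boost.Math singleton trick
// for forcing the static polynomial tables into existence once, during
// dynamic initialisation, before any concurrent first call.  On GPU targets
// the tables are compile-time constants instead, so force_instantiate() is
// deliberately a no-op there; that is what the BOOST_MATH_HAS_GPU_SUPPORT
// guard expresses.  Host-only sketch of the shape:
template <typename T>
struct initializer_sketch
{
   struct init
   {
      init() { /* call the function once so its statics are built */ }
      void force_instantiate() const {}
   };
   static const init instance;
   static void force_instantiate() { instance.force_instantiate(); }
};
template <typename T>
const typename initializer_sketch<T>::init initializer_sketch<T>::instance;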
static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.3220913409857223519e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.5145160675335701966e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6178836581270835179e+04)), @@ -129,7 +132,7 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.5265133846636032186e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)) }; - static const T QS[] = { + BOOST_MATH_STATIC const T QS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0871281941028743574e+05)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8194580422439972989e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4194606696037208929e+06)), @@ -138,12 +141,13 @@ T bessel_j1(T x) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.6383677696049909675e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)) }; - static const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.8317059702075123156e+00)), - x2 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0155866698156187535e+00)), - x11 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 9.810e+02)), - x12 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.2527979248768438556e-04)), - x21 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7960e+03)), - x22 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.8330184381246462950e-05)); + + BOOST_MATH_STATIC const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.8317059702075123156e+00)); + BOOST_MATH_STATIC const T x2 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0155866698156187535e+00)); + BOOST_MATH_STATIC const T x11 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 9.810e+02)); + BOOST_MATH_STATIC const T x12 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.2527979248768438556e-04)); + BOOST_MATH_STATIC const T x21 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7960e+03)); + BOOST_MATH_STATIC const T x22 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.8330184381246462950e-05)); T value, factor, r, rc, rs, w; diff --git a/include/boost/math/special_functions/detail/bessel_jn.hpp b/include/boost/math/special_functions/detail/bessel_jn.hpp index a08af0548..73bc0c562 100644 --- a/include/boost/math/special_functions/detail/bessel_jn.hpp +++ b/include/boost/math/special_functions/detail/bessel_jn.hpp @@ -10,6 +10,10 @@ #pragma once #endif +#include +#include +#include +#include #include #include #include @@ -24,7 +28,7 @@ namespace boost { namespace math { namespace detail{ template -T bessel_jn(int n, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T bessel_jn(int n, T x, const Policy& pol) { T value(0), factor, current, prev, next; diff --git a/include/boost/math/special_functions/detail/bessel_jy.hpp b/include/boost/math/special_functions/detail/bessel_jy.hpp index 90e099eb7..143dce872 100644 --- a/include/boost/math/special_functions/detail/bessel_jy.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy.hpp @@ -11,16 +11,18 @@ #endif #include +#include +#include #include #include #include #include #include +#include #include #include #include #include -#include // Bessel functions of the first and second kind of fractional order @@ -38,7 +40,7 @@ namespace boost { namespace math { // try it and see... 
// template - bool hankel_PQ(T v, T x, T* p, T* q, const Policy& ) + BOOST_MATH_GPU_ENABLED bool hankel_PQ(T v, T x, T* p, T* q, const Policy& ) { BOOST_MATH_STD_USING T tolerance = 2 * policies::get_epsilon(); @@ -70,7 +72,7 @@ namespace boost { namespace math { // Calculate Y(v, x) and Y(v+1, x) by Temme's method, see // Temme, Journal of Computational Physics, vol 21, 343 (1976) template - int temme_jy(T v, T x, T* Y, T* Y1, const Policy& pol) + BOOST_MATH_GPU_ENABLED int temme_jy(T v, T x, T* Y, T* Y1, const Policy& pol) { T g, h, p, q, f, coef, sum, sum1, tolerance; T a, d, e, sigma; @@ -139,7 +141,7 @@ namespace boost { namespace math { // Evaluate continued fraction fv = J_(v+1) / J_v, see // Abramowitz and Stegun, Handbook of Mathematical Functions, 1972, 9.1.73 template - int CF1_jy(T v, T x, T* fv, int* sign, const Policy& pol) + BOOST_MATH_GPU_ENABLED int CF1_jy(T v, T x, T* fv, int* sign, const Policy& pol) { T C, D, f, a, b, delta, tiny, tolerance; unsigned long k; @@ -185,7 +187,7 @@ namespace boost { namespace math { // real values only. // template - int CF2_jy(T v, T x, T* p, T* q, const Policy& pol) + BOOST_MATH_GPU_ENABLED int CF2_jy(T v, T x, T* p, T* q, const Policy& pol) { BOOST_MATH_STD_USING @@ -254,13 +256,13 @@ namespace boost { namespace math { return 0; } - static const int need_j = 1; - static const int need_y = 2; + BOOST_MATH_STATIC const int need_j = 1; + BOOST_MATH_STATIC const int need_y = 2; // Compute J(v, x) and Y(v, x) simultaneously by Steed's method, see // Barnett et al, Computer Physics Communications, vol 8, 377 (1974) template - int bessel_jy(T v, T x, T* J, T* Y, int kind, const Policy& pol) + BOOST_MATH_GPU_ENABLED int bessel_jy(T v, T x, T* J, T* Y, int kind, const Policy& pol) { BOOST_MATH_ASSERT(x >= 0); @@ -273,7 +275,7 @@ namespace boost { namespace math { T cp = 0; T sp = 0; - static const char* function = "boost::math::bessel_jy<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::bessel_jy<%1%>(%1%,%1%)"; BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -284,7 +286,7 @@ namespace boost { namespace math { reflect = true; v = -v; // v is non-negative from here } - if (v > static_cast((std::numeric_limits::max)())) + if (v > static_cast((boost::math::numeric_limits::max)())) { *J = *Y = policies::raise_evaluation_error(function, "Order of Bessel function is too large to evaluate: got %1%", v, pol); return 1; // LCOV_EXCL_LINE previous line will throw. @@ -310,10 +312,10 @@ namespace boost { namespace math { else if(kind & need_j) *J = policies::raise_domain_error(function, "Value of Bessel J_v(x) is complex-infinity at %1%", x, pol); // complex infinity else - *J = std::numeric_limits::quiet_NaN(); // LCOV_EXCL_LINE, we should never get here, any value will do, not using J. + *J = boost::math::numeric_limits::quiet_NaN(); // LCOV_EXCL_LINE, we should never get here, any value will do, not using J. if((kind & need_y) == 0) - *Y = std::numeric_limits::quiet_NaN(); // any value will do, not using Y. + *Y = boost::math::numeric_limits::quiet_NaN(); // any value will do, not using Y. 
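// Aside: bessel_jy computes J_v and Y_v together; callers say which results
// they actually need through the need_j / need_y bitmask defined above, and
// branches reached for only one of the two fill the unused output with a
// quiet NaN ("any value will do") instead of computing it.  Sketch of the
// mask tests as the implementation performs them:
inline bool wants_j(int kind) { return (kind & 1) != 0; }   // need_j == 1
inline bool wants_y(int kind) { return (kind & 2) != 0; }   // need_y == 2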
else { // We should never get here: @@ -333,7 +335,7 @@ namespace boost { namespace math { // and divergent which leads to large errors :-( // Jv = bessel_j_small_z_series(v, x, pol); - Yv = std::numeric_limits::quiet_NaN(); + Yv = boost::math::numeric_limits::quiet_NaN(); } else if((x < 1) && (u != 0) && (log(policies::get_epsilon() / 2) > v * log((x/2) * (x/2) / v))) { @@ -344,7 +346,7 @@ if(kind&need_j) Jv = bessel_j_small_z_series(v, x, pol); else - Jv = std::numeric_limits::quiet_NaN(); + Jv = boost::math::numeric_limits::quiet_NaN(); if((org_kind&need_y && (!reflect || (cp != 0))) || (org_kind & need_j && (reflect && (sp != 0)))) { @@ -352,7 +354,7 @@ Yv = bessel_y_small_z_series(v, x, &Yv_scale, pol); } else - Yv = std::numeric_limits::quiet_NaN(); + Yv = boost::math::numeric_limits::quiet_NaN(); } else if((u == 0) && (x < policies::get_epsilon())) { @@ -363,7 +365,7 @@ if(kind&need_j) Jv = bessel_j_small_z_series(v, x, pol); else - Jv = std::numeric_limits::quiet_NaN(); + Jv = boost::math::numeric_limits::quiet_NaN(); if((org_kind&need_y && (!reflect || (cp != 0))) || (org_kind & need_j && (reflect && (sp != 0)))) { @@ -371,7 +373,7 @@ Yv = bessel_yn_small_z(n, x, &Yv_scale, pol); } else - Yv = std::numeric_limits::quiet_NaN(); + Yv = boost::math::numeric_limits::quiet_NaN(); // LCOV_EXCL_STOP } else if(asymptotic_bessel_large_x_limit(v, x)) { @@ -381,13 +383,13 @@ Yv = asymptotic_bessel_y_large_x_2(v, x, pol); } else - Yv = std::numeric_limits::quiet_NaN(); // any value will do, we're not using it. + Yv = boost::math::numeric_limits::quiet_NaN(); // any value will do, we're not using it. if(kind&need_j) { Jv = asymptotic_bessel_j_large_x_2(v, x, pol); } else - Jv = std::numeric_limits::quiet_NaN(); // any value will do, we're not using it. + Jv = boost::math::numeric_limits::quiet_NaN(); // any value will do, we're not using it. } else if((x > 8) && hankel_PQ(v, x, &p, &q, pol)) { @@ -449,7 +451,7 @@ Jv = scale * W / (Yv * fv - Yv1); // Wronskian relation } else - Jv = std::numeric_limits::quiet_NaN(); // any value will do, we're not using it. + Jv = boost::math::numeric_limits::quiet_NaN(); // any value will do, we're not using it. Yv_scale = scale; } else // x in (2, \infty) @@ -564,7 +566,7 @@ Yv = prev; } else - Yv = std::numeric_limits::quiet_NaN(); // any value will do, we're not using it. + Yv = boost::math::numeric_limits::quiet_NaN(); // any value will do, we're not using it. } if (reflect) diff --git a/include/boost/math/special_functions/detail/bessel_jy_asym.hpp b/include/boost/math/special_functions/detail/bessel_jy_asym.hpp index cb09b202d..51e4efafc 100644 --- a/include/boost/math/special_functions/detail/bessel_jy_asym.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy_asym.hpp @@ -16,12 +16,15 @@ #pragma once #endif +#include +#include #include +#include namespace boost{ namespace math{ namespace detail{ template -inline T asymptotic_bessel_amplitude(T v, T x) +BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_amplitude(T v, T x) { // Calculate the amplitude of J(v, x) and Y(v, x) for large // x: see A&S 9.2.28.
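// Aside: in the large-x branch above the implementation switches to the
// A&S 9.2 asymptotics, J_v(x) ~ M_v(x) cos(theta_v(x)) and
// Y_v(x) ~ M_v(x) sin(theta_v(x)), with amplitude M_v(x) ~ sqrt(2/(pi x))
// and phase theta_v(x) ~ x - (v/2 + 1/4) pi plus corrections in 1/x.
// Leading-order sketch, with no correction terms (illustrative only):
#include <cmath>

double asymptotic_j_sketch(double v, double x)
{
   const double pi = 3.141592653589793238462643383279502884;
   const double amplitude = std::sqrt(2 / (pi * x));
   const double phase = x - (v / 2 + 0.25) * pi;
   return amplitude * std::cos(phase);   // ~ J_v(x) for x >> v
}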
@@ -39,7 +42,7 @@ inline T asymptotic_bessel_amplitude(T v, T x) } template -T asymptotic_bessel_phase_mx(T v, T x) +BOOST_MATH_GPU_ENABLED T asymptotic_bessel_phase_mx(T v, T x) { // // Calculate the phase of J(v, x) and Y(v, x) for large x. @@ -63,7 +66,7 @@ T asymptotic_bessel_phase_mx(T v, T x) } template -inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol) { // See A&S 9.2.19. BOOST_MATH_STD_USING @@ -93,7 +96,7 @@ inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol) } template -inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol) { // See A&S 9.2.19. BOOST_MATH_STD_USING @@ -124,7 +127,7 @@ inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol) } template -inline bool asymptotic_bessel_large_x_limit(int v, const T& x) +BOOST_MATH_GPU_ENABLED inline bool asymptotic_bessel_large_x_limit(int v, const T& x) { BOOST_MATH_STD_USING // @@ -142,7 +145,7 @@ inline bool asymptotic_bessel_large_x_limit(int v, const T& x) } template -inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x) +BOOST_MATH_GPU_ENABLED inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x) { BOOST_MATH_STD_USING // @@ -155,11 +158,11 @@ inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x) // error rates either side of the divide for v < 10000. // At double precision eps^1/8 ~= 0.01. // - return (std::max)(T(fabs(v)), T(1)) < x * sqrt(tools::forth_root_epsilon()); + return BOOST_MATH_GPU_SAFE_MAX(T(fabs(v)), T(1)) < x * sqrt(tools::forth_root_epsilon()); } template -void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol) +BOOST_MATH_GPU_ENABLED void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol) { T c = 1; T p = (v / boost::math::sin_pi(v, pol)) * pow(x / 2, -v) / boost::math::tgamma(1 - v, pol); @@ -193,7 +196,7 @@ void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol) } template -T asymptotic_bessel_i_large_x(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T asymptotic_bessel_i_large_x(T v, T x, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names T s = 1; diff --git a/include/boost/math/special_functions/detail/bessel_jy_series.hpp b/include/boost/math/special_functions/detail/bessel_jy_series.hpp index db46f3640..5c083f348 100644 --- a/include/boost/math/special_functions/detail/bessel_jy_series.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy_series.hpp @@ -10,10 +10,9 @@ #pragma once #endif -#include -#include #include #include +#include namespace boost { namespace math { namespace detail{ @@ -22,7 +21,7 @@ struct bessel_j_small_z_series_term { typedef T result_type; - bessel_j_small_z_series_term(T v_, T x) + BOOST_MATH_GPU_ENABLED bessel_j_small_z_series_term(T v_, T x) : N(0), v(v_) { BOOST_MATH_STD_USING @@ -30,7 +29,7 @@ struct bessel_j_small_z_series_term mult *= -mult; term = 1; } - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { T r = term; ++N; @@ -49,7 +48,7 @@ private: // Converges rapidly for all z << v. 
// template -inline T bessel_j_small_z_series(T v, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T bessel_j_small_z_series(T v, T x, const Policy& pol) { BOOST_MATH_STD_USING T prefix; @@ -66,7 +65,7 @@ inline T bessel_j_small_z_series(T v, T x, const Policy& pol) return prefix; bessel_j_small_z_series_term s(v, x); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon(), max_iter); @@ -79,7 +78,7 @@ struct bessel_y_small_z_series_term_a { typedef T result_type; - bessel_y_small_z_series_term_a(T v_, T x) + BOOST_MATH_GPU_ENABLED bessel_y_small_z_series_term_a(T v_, T x) : N(0), v(v_) { BOOST_MATH_STD_USING @@ -87,7 +86,7 @@ struct bessel_y_small_z_series_term_a mult *= -mult; term = 1; } - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { BOOST_MATH_STD_USING T r = term; @@ -107,7 +106,7 @@ struct bessel_y_small_z_series_term_b { typedef T result_type; - bessel_y_small_z_series_term_b(T v_, T x) + BOOST_MATH_GPU_ENABLED bessel_y_small_z_series_term_b(T v_, T x) : N(0), v(v_) { BOOST_MATH_STD_USING @@ -115,7 +114,7 @@ struct bessel_y_small_z_series_term_b mult *= -mult; term = 1; } - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { T r = term; ++N; @@ -138,10 +137,10 @@ private: // eps/2 * v^v(x/2)^-v > (x/2)^v or log(eps/2) > v log((x/2)^2/v) // template -inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "bessel_y_small_z_series<%1%>(%1%,%1%)"; + constexpr auto function = "bessel_y_small_z_series<%1%>(%1%,%1%)"; T prefix; T gam; T p = log(x / 2); @@ -183,7 +182,7 @@ inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol) prefix = -exp(prefix); } bessel_y_small_z_series_term_a s(v, x); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); *pscale = scale; T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon(), max_iter); @@ -211,7 +210,7 @@ inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol) } template -T bessel_yn_small_z(int n, T z, T* scale, const Policy& pol) +BOOST_MATH_GPU_ENABLED T bessel_yn_small_z(int n, T z, T* scale, const Policy& pol) { // // See http://functions.wolfram.com/Bessel-TypeFunctions/BesselY/06/01/04/01/02/ diff --git a/include/boost/math/special_functions/detail/bessel_jy_zero.hpp b/include/boost/math/special_functions/detail/bessel_jy_zero.hpp index cb1fc48d8..15671c0df 100644 --- a/include/boost/math/special_functions/detail/bessel_jy_zero.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy_zero.hpp @@ -18,19 +18,30 @@ #ifndef BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_ #define BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_ - #include + #include + #include + #include + #include + #include #include - #include #include #include + #ifndef BOOST_MATH_HAS_NVRTC + #include + #endif + + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_suppress 20012 + #endif + namespace boost { namespace math { namespace detail { namespace bessel_zero { template - T equation_nist_10_21_19(const T& v, const T& a) + BOOST_MATH_GPU_ENABLED T equation_nist_10_21_19(const T& v, const T& a) { // Get the initial estimate of the m'th root of Jv or Yv. 
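// Aside: equation_nist_10_21_19 above is McMahon's expansion (NIST DLMF
// 10.21.19) for the m'th zero of J_v or Y_v: with mu = 4 v^2 and the caller
// supplying a (a = (m + v/2 - 1/4) pi for J_v), the zero is approximately
//   a - (mu - 1)/(8 a) - 4 (mu - 1)(7 mu - 31) / (3 (8 a)^3) - ...
// Two-term sketch, good enough as a Newton / bisection starting point:
double mcmahon_guess_sketch(double v, double m)
{
   const double pi = 3.141592653589793238462643383279502884;
   const double a  = (m + v / 2 - 0.25) * pi;  // leading approximation
   const double mu = 4 * v * v;
   return a - (mu - 1) / (8 * a);              // first correction term
}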
// This subroutine is used for the order m with m > 1. @@ -57,11 +68,11 @@ class equation_as_9_3_39_and_its_derivative { public: - explicit equation_as_9_3_39_and_its_derivative(const T& zt) : zeta(zt) { } + BOOST_MATH_GPU_ENABLED explicit equation_as_9_3_39_and_its_derivative(const T& zt) : zeta(zt) { } - equation_as_9_3_39_and_its_derivative(const equation_as_9_3_39_and_its_derivative&) = default; + BOOST_MATH_GPU_ENABLED equation_as_9_3_39_and_its_derivative(const equation_as_9_3_39_and_its_derivative&) = default; - boost::math::tuple operator()(const T& z) const + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& z) const { BOOST_MATH_STD_USING // ADL of std names, needed for acos, sqrt. @@ -86,7 +97,7 @@ }; template - static T equation_as_9_5_26(const T& v, const T& ai_bi_root, const Policy& pol) + BOOST_MATH_GPU_ENABLED T equation_as_9_5_26(const T& v, const T& ai_bi_root, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for log, sqrt. @@ -132,9 +143,9 @@ // Select the maximum allowed iterations based on the number // of decimal digits in the numeric type T, being at least 12. - const auto iterations_allowed = static_cast((std::max)(12, my_digits10 * 2)); + const auto iterations_allowed = static_cast(BOOST_MATH_GPU_SAFE_MAX(12, my_digits10 * 2)); - std::uintmax_t iterations_used = iterations_allowed; + boost::math::uintmax_t iterations_used = iterations_allowed; // Calculate the root of z as a function of zeta. const T z = boost::math::tools::newton_raphson_iterate( @@ -142,7 +153,7 @@ z_estimate, range_zmin, range_zmax, - (std::min)(boost::math::tools::digits(), boost::math::tools::digits()), + BOOST_MATH_GPU_SAFE_MIN(boost::math::tools::digits(), boost::math::tools::digits()), iterations_used); static_cast(iterations_used); @@ -168,7 +179,7 @@ namespace cyl_bessel_j_zero_detail { template - T equation_nist_10_21_40_a(const T& v, const Policy& pol) + BOOST_MATH_GPU_ENABLED T equation_nist_10_21_40_a(const T& v, const Policy& pol) { const T v_pow_third(boost::math::cbrt(v, pol)); const T v_pow_minus_two_thirds(T(1) / (v_pow_third * v_pow_third)); @@ -185,13 +196,13 @@ class function_object_jv { public: - function_object_jv(const T& v, + BOOST_MATH_GPU_ENABLED function_object_jv(const T& v, const Policy& pol) : my_v(v), my_pol(pol) { } - function_object_jv(const function_object_jv&) = default; + BOOST_MATH_GPU_ENABLED function_object_jv(const function_object_jv&) = default; - T operator()(const T& x) const + BOOST_MATH_GPU_ENABLED T operator()(const T& x) const { return boost::math::cyl_bessel_j(my_v, x, my_pol); } @@ -206,15 +217,16 @@ class function_object_jv_and_jv_prime { public: - function_object_jv_and_jv_prime(const T& v, - const bool order_is_zero, - const Policy& pol) : my_v(v), + BOOST_MATH_GPU_ENABLED function_object_jv_and_jv_prime( + const T& v, + const bool order_is_zero, + const Policy& pol) : my_v(v), my_order_is_zero(order_is_zero), my_pol(pol) { } function_object_jv_and_jv_prime(const function_object_jv_and_jv_prime&) = default; - boost::math::tuple operator()(const T& x) const + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& x) const { // Obtain Jv(x) and Jv'(x). 
// Chris's original code called the Bessel function implementation layer direct, @@ -246,10 +258,10 @@ const function_object_jv_and_jv_prime& operator=(const function_object_jv_and_jv_prime&) = delete; }; - template bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; } + template BOOST_MATH_GPU_ENABLED bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; } template - T initial_guess(const T& v, const int m, const Policy& pol) + BOOST_MATH_GPU_ENABLED T initial_guess(const T& v, const int m, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for floor. @@ -325,7 +337,7 @@ } // Perform several steps of bisection iteration to refine the guess. - std::uintmax_t number_of_iterations(12U); + boost::math::uintmax_t number_of_iterations(12U); // Do the bisection iteration. const boost::math::tuple guess_pair = @@ -390,7 +402,7 @@ namespace cyl_neumann_zero_detail { template - T equation_nist_10_21_40_b(const T& v, const Policy& pol) + BOOST_MATH_GPU_ENABLED T equation_nist_10_21_40_b(const T& v, const Policy& pol) { const T v_pow_third(boost::math::cbrt(v, pol)); const T v_pow_minus_two_thirds(T(1) / (v_pow_third * v_pow_third)); @@ -407,13 +419,13 @@ class function_object_yv { public: - function_object_yv(const T& v, - const Policy& pol) : my_v(v), - my_pol(pol) { } + BOOST_MATH_GPU_ENABLED function_object_yv(const T& v, + const Policy& pol) : my_v(v), + my_pol(pol) { } - function_object_yv(const function_object_yv&) = default; + BOOST_MATH_GPU_ENABLED function_object_yv(const function_object_yv&) = default; - T operator()(const T& x) const + BOOST_MATH_GPU_ENABLED T operator()(const T& x) const { return boost::math::cyl_neumann(my_v, x, my_pol); } @@ -428,13 +440,13 @@ class function_object_yv_and_yv_prime { public: - function_object_yv_and_yv_prime(const T& v, - const Policy& pol) : my_v(v), - my_pol(pol) { } + BOOST_MATH_GPU_ENABLED function_object_yv_and_yv_prime(const T& v, + const Policy& pol) : my_v(v), + my_pol(pol) { } - function_object_yv_and_yv_prime(const function_object_yv_and_yv_prime&) = default; + BOOST_MATH_GPU_ENABLED function_object_yv_and_yv_prime(const function_object_yv_and_yv_prime&) = default; - boost::math::tuple operator()(const T& x) const + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(const T& x) const { const T half_epsilon(boost::math::tools::epsilon() / 2U); @@ -469,10 +481,10 @@ const function_object_yv_and_yv_prime& operator=(const function_object_yv_and_yv_prime&) = delete; }; - template bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; } + template BOOST_MATH_GPU_ENABLED bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; } template - T initial_guess(const T& v, const int m, const Policy& pol) + BOOST_MATH_GPU_ENABLED T initial_guess(const T& v, const int m, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names, needed for floor. @@ -560,7 +572,7 @@ } // Perform several steps of bisection iteration to refine the guess. - std::uintmax_t number_of_iterations(12U); + boost::math::uintmax_t number_of_iterations(12U); // Do the bisection iteration. 
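// Aside: the bisection that follows runs a fixed, small number of steps (12)
// purely to tighten the bracket around the zero before the Newton-Raphson
// polish; my_bisection_unreachable_tolerance always returns false, so the
// loop is bounded by the iteration count alone.  Minimal sketch of such a
// capped bisection (capped_bisect_sketch is an illustrative name):
template <typename F>
double capped_bisect_sketch(F f, double lo, double hi, int iterations)
{
   double f_lo = f(lo);   // assumes f changes sign on [lo, hi]
   while (iterations--)
   {
      const double mid = (lo + hi) / 2;
      const double f_mid = f(mid);
      if ((f_lo < 0) == (f_mid < 0)) { lo = mid; f_lo = f_mid; }
      else                           { hi = mid; }
   }
   return (lo + hi) / 2;
}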
const boost::math::tuple guess_pair = @@ -624,4 +636,8 @@ } // namespace bessel_zero } } } // namespace boost::math::detail + #ifdef BOOST_MATH_ENABLE_CUDA + # pragma nv_diag_default 20012 + #endif + #endif // BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_ diff --git a/include/boost/math/special_functions/detail/bessel_k0.hpp b/include/boost/math/special_functions/detail/bessel_k0.hpp index f29ffa75c..bab202b6c 100644 --- a/include/boost/math/special_functions/detail/bessel_k0.hpp +++ b/include/boost/math/special_functions/detail/bessel_k0.hpp @@ -13,10 +13,14 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif +#include +#include +#include +#include #include #include -#include #include +#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -44,35 +48,37 @@ namespace boost { namespace math { namespace detail{ template -T bessel_k0(const T& x); +BOOST_MATH_GPU_ENABLED T bessel_k0(const T& x); template struct bessel_k0_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(tag()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { bessel_k0(T(0.5)); bessel_k0(T(1.5)); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { bessel_k0(T(0.5)); bessel_k0(T(1.5)); } template - static void do_init(const U&){} - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED static void do_init(const U&){} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; - static const init initializer; - static void force_instantiate() + BOOST_MATH_STATIC const init initializer; + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -81,14 +87,14 @@ const typename bessel_k0_initializer::init bessel_k0_initializer template -T bessel_k0_imp(const T&, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T&, const boost::math::integral_constant&) { BOOST_MATH_ASSERT(0); return 0; } template -T bessel_k0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -97,14 +103,14 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : -2.358e-09 // Maximum Relative Change in Control Points : 9.552e-02 // Max Error found at float precision = Poly : 4.448220e-08 - static const T Y = 1.137250900268554688f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.137250900268554688f; + BOOST_MATH_STATIC const T P[] = { -1.372508979104259711e-01f, 2.622545986273687617e-01f, 5.047103728247919836e-03f }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000000000000e+00f, -8.928694018000029415e-02f, @@ -117,7 +123,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : -1.343e-09 // Maximum Relative Change in Control Points : 2.405e-02 // Max Error found at float precision = Poly : 1.354814e-07 - static const T P2[] = { + BOOST_MATH_STATIC const T P2[] = { 1.159315158e-01f, 2.789828686e-01f, 2.524902861e-02f, @@ -133,14 +139,14 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 9.064e-02 // Max Error found at float precision = Poly : 5.065020e-08 - static const T P[] = + BOOST_MATH_STATIC const T P[] = { 
2.533141220e-01f, 5.221502603e-01f, 6.380180669e-02f, -5.934976547e-02f }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000e+00f, 2.679722431e+00f, @@ -158,7 +164,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) } template -T bessel_k0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -167,8 +173,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : -6.077e-17 // Maximum Relative Change in Control Points : 7.797e-02 // Max Error found at double precision = Poly : 1.003156e-16 - static const T Y = 1.137250900268554688; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.137250900268554688; + BOOST_MATH_STATIC const T P[] = { -1.372509002685546267e-01, 2.574916117833312855e-01, @@ -176,7 +182,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) 5.445476986653926759e-04, 7.125159422136622118e-06 }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000000000000e+00, -5.458333438017788530e-02, @@ -191,7 +197,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 3.392e-18 // Maximum Relative Change in Control Points : 2.041e-02 // Max Error found at double precision = Poly : 2.513112e-16 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { 1.159315156584124484e-01, 2.789828789146031732e-01, @@ -212,8 +218,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Maximum Relative Change in Control Points : 2.757e-01 // Max Error found at double precision = Poly : 1.001560e-16 - static const T Y = 1; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1; + BOOST_MATH_STATIC const T P[] = { 2.533141373155002416e-01, 3.628342133984595192e+00, @@ -225,7 +231,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) -1.414237994269995877e+00, -9.369168119754924625e-02 }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000000000000e+00, 1.494194694879908328e+01, @@ -248,7 +254,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) } template -T bessel_k0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -257,8 +263,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 2.180e-22 // Maximum Relative Change in Control Points : 2.943e-01 // Max Error found at float80 precision = Poly : 3.923207e-20 - static const T Y = 1.137250900268554687500e+00; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.137250900268554687500e+00; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -1.372509002685546875002e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 2.566481981037407600436e-01), @@ -267,7 +273,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) BOOST_MATH_BIG_CONSTANT(T, 64, 1.213747930378196492543e-05), BOOST_MATH_BIG_CONSTANT(T, 64, 9.423709328020389560844e-08) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, -4.843828412587773008342e-02), @@ -284,7 +290,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : -2.434e-21 // Maximum Relative Change in Control Points : 2.459e-02 // Max Error found at float80 precision = Poly : 1.482487e-19 - static const T P2[] = + BOOST_MATH_STATIC 
const T P2[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.159315156584124488110e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 2.764832791416047889734e-01), @@ -292,7 +298,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) BOOST_MATH_BIG_CONSTANT(T, 64, 3.660777862036966089410e-04), BOOST_MATH_BIG_CONSTANT(T, 64, 2.094942446930673386849e-06) }; - static const T Q2[] = + BOOST_MATH_STATIC const T Q2[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, -2.156100313881251616320e-02), @@ -308,8 +314,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 2.236e-21 // Maximum Relative Change in Control Points : 3.021e-01 //Max Error found at float80 precision = Poly : 8.727378e-20 - static const T Y = 1; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 2.533141373155002512056e-01), BOOST_MATH_BIG_CONSTANT(T, 64, 5.417942070721928652715e+00), @@ -323,7 +329,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) BOOST_MATH_BIG_CONSTANT(T, 64, -4.059789241612946683713e+00), BOOST_MATH_BIG_CONSTANT(T, 64, -1.612783121537333908889e-01) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, 2.200669254769325861404e+01), @@ -348,7 +354,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) } template -T bessel_k0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -357,8 +363,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 5.682e-37 // Maximum Relative Change in Control Points : 6.094e-04 // Max Error found at float128 precision = Poly : 5.338213e-35 - static const T Y = 1.137250900268554687500000000000000000e+00f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.137250900268554687500000000000000000e+00f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -1.372509002685546875000000000000000006e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.556212905071072782462974351698081303e-01), @@ -369,7 +375,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) BOOST_MATH_BIG_CONSTANT(T, 113, 1.752489221949580551692915881999762125e-09), BOOST_MATH_BIG_CONSTANT(T, 113, 5.243010555737173524710512824955368526e-12) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, -4.095631064064621099785696980653193721e-02), @@ -387,7 +393,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 5.105e-38 // Maximum Relative Change in Control Points : 9.734e-03 // Max Error found at float128 precision = Poly : 1.688806e-34 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.159315156584124488107200313757741370e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.789828789146031122026800078439435369e-01), @@ -413,8 +419,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) // Expected Error Term : 4.917e-40 // Maximum Relative Change in Control Points : 3.385e-01 // Max Error found at float128 precision = Poly : 1.567573e-34 - static const T Y = 1; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 
2.533141373155002512078826424055226265e-01), BOOST_MATH_BIG_CONSTANT(T, 113, 2.001949740768235770078339977110749204e+01), @@ -439,7 +445,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) BOOST_MATH_BIG_CONSTANT(T, 113, -4.201632288615609937883545928660649813e+03), BOOST_MATH_BIG_CONSTANT(T, 113, -3.690820607338480548346746717311811406e+01) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, 7.964877874035741452203497983642653107e+01), @@ -475,33 +481,33 @@ T bessel_k0_imp(const T& x, const std::integral_constant&) } template -T bessel_k0_imp(const T& x, const std::integral_constant&) +BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant&) { if(boost::math::tools::digits() <= 24) - return bessel_k0_imp(x, std::integral_constant()); + return bessel_k0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 53) - return bessel_k0_imp(x, std::integral_constant()); + return bessel_k0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 64) - return bessel_k0_imp(x, std::integral_constant()); + return bessel_k0_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 113) - return bessel_k0_imp(x, std::integral_constant()); + return bessel_k0_imp(x, boost::math::integral_constant()); BOOST_MATH_ASSERT(0); return 0; } template -inline T bessel_k0(const T& x) +BOOST_MATH_GPU_ENABLED inline T bessel_k0(const T& x) { - typedef std::integral_constant::digits == 0) || (std::numeric_limits::radix != 2)) ? + typedef boost::math::integral_constant::digits == 0) || (boost::math::numeric_limits::radix != 2)) ? 0 : - std::numeric_limits::digits <= 24 ? + boost::math::numeric_limits::digits <= 24 ? 24 : - std::numeric_limits::digits <= 53 ? + boost::math::numeric_limits::digits <= 53 ? 53 : - std::numeric_limits::digits <= 64 ? + boost::math::numeric_limits::digits <= 64 ? 64 : - std::numeric_limits::digits <= 113 ? + boost::math::numeric_limits::digits <= 113 ? 
113 : -1 > tag_type; diff --git a/include/boost/math/special_functions/detail/bessel_k1.hpp b/include/boost/math/special_functions/detail/bessel_k1.hpp index bd37f9021..49846dc8c 100644 --- a/include/boost/math/special_functions/detail/bessel_k1.hpp +++ b/include/boost/math/special_functions/detail/bessel_k1.hpp @@ -13,6 +13,10 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif +#include +#include +#include +#include #include #include #include @@ -44,36 +48,38 @@ namespace boost { namespace math { namespace detail{ template - T bessel_k1(const T&); + BOOST_MATH_GPU_ENABLED T bessel_k1(const T&); template struct bessel_k1_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(tag()); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { bessel_k1(T(0.5)); bessel_k1(T(2)); bessel_k1(T(6)); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { bessel_k1(T(0.5)); bessel_k1(T(6)); } template - static void do_init(const U&) {} - void force_instantiate()const {} + BOOST_MATH_GPU_ENABLED static void do_init(const U&) {} + BOOST_MATH_GPU_ENABLED void force_instantiate()const {} }; - static const init initializer; - static void force_instantiate() + BOOST_MATH_STATIC const init initializer; + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -82,14 +88,14 @@ namespace boost { namespace math { namespace detail{ template - inline T bessel_k1_imp(const T&, const std::integral_constant&) + inline BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T&, const boost::math::integral_constant&) { BOOST_MATH_ASSERT(0); return 0; } template - T bessel_k1_imp(const T& x, const std::integral_constant&) + BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -98,14 +104,14 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -3.053e-12 // Maximum Relative Change in Control Points : 4.927e-02 // Max Error found at float precision = Poly : 7.918347e-10 - static const T Y = 8.695471287e-02f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 8.695471287e-02f; + BOOST_MATH_STATIC const T P[] = { -3.621379531e-03f, 7.131781976e-03f, -1.535278300e-05f }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000e+00f, -5.173102701e-02f, @@ -118,7 +124,7 @@ namespace boost { namespace math { namespace detail{ // Maximum Deviation Found: 3.556e-08 // Expected Error Term : -3.541e-08 // Maximum Relative Change in Control Points : 8.203e-02 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { -3.079657469e-01f, -8.537108913e-02f, @@ -134,15 +140,15 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -3.227e-08 // Maximum Relative Change in Control Points : 9.917e-02 // Max Error found at float precision = Poly : 6.084411e-08 - static const T Y = 1.450342178f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.450342178f; + BOOST_MATH_STATIC const T P[] = { -1.970280088e-01f, 2.188747807e-02f, 7.270394756e-01f, 2.490678196e-01f }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.000000000e+00f, 2.274292882e+00f, @@ -160,7 +166,7 @@ namespace boost { namespace math { namespace detail{ } template - T bessel_k1_imp(const T& x, const 
std::integral_constant&) + BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -169,15 +175,15 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : 1.921e-17 // Maximum Relative Change in Control Points : 5.287e-03 // Max Error found at double precision = Poly : 2.004747e-17 - static const T Y = 8.69547128677368164e-02f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 8.69547128677368164e-02f; + BOOST_MATH_STATIC const T P[] = { -3.62137953440350228e-03, 7.11842087490330300e-03, 1.00302560256614306e-05, 1.77231085381040811e-06 }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.00000000000000000e+00, -4.80414794429043831e-02, @@ -193,14 +199,14 @@ namespace boost { namespace math { namespace detail{ // Maximum Relative Change in Control Points : 3.103e-04 // Max Error found at double precision = Poly : 1.246698e-16 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { -3.07965757829206184e-01, -7.80929703673074907e-02, -2.70619343754051620e-03, -2.49549522229072008e-05 }; - static const T Q2[] = + BOOST_MATH_STATIC const T Q2[] = { 1.00000000000000000e+00, -2.36316836412163098e-02, @@ -217,8 +223,8 @@ namespace boost { namespace math { namespace detail{ // Maximum Relative Change in Control Points : 2.786e-01 // Max Error found at double precision = Poly : 1.258798e-16 - static const T Y = 1.45034217834472656f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.45034217834472656f; + BOOST_MATH_STATIC const T P[] = { -1.97028041029226295e-01, -2.32408961548087617e+00, @@ -230,7 +236,7 @@ namespace boost { namespace math { namespace detail{ 6.62582288933739787e+00, 3.08851840645286691e-01 }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { 1.00000000000000000e+00, 1.41811409298826118e+01, @@ -253,7 +259,7 @@ namespace boost { namespace math { namespace detail{ } template - T bessel_k1_imp(const T& x, const std::integral_constant&) + BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -262,8 +268,8 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -5.548e-23 // Maximum Relative Change in Control Points : 2.002e-03 // Max Error found at float80 precision = Poly : 9.352785e-22 - static const T Y = 8.695471286773681640625e-02f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 8.695471286773681640625e-02f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -3.621379534403483072861e-03), BOOST_MATH_BIG_CONSTANT(T, 64, 7.102135866103952705932e-03), @@ -271,7 +277,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 64, 2.537484002571894870830e-06), BOOST_MATH_BIG_CONSTANT(T, 64, 6.603228256820000135990e-09) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, -4.354457194045068370363e-02), @@ -287,7 +293,7 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : 1.995e-23 // Maximum Relative Change in Control Points : 8.174e-04 // Max Error found at float80 precision = Poly : 4.137325e-20 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -3.079657578292062244054e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -7.963049154965966503231e-02), @@ -295,7 +301,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 64, 
-4.023052834702215699504e-05), BOOST_MATH_BIG_CONSTANT(T, 64, -1.719459155018493821839e-07) }; - static const T Q2[] = + BOOST_MATH_STATIC const T Q2[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, -1.863917670410152669768e-02), @@ -312,8 +318,8 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -3.302e-21 // Maximum Relative Change in Control Points : 3.432e-01 // Max Error found at float80 precision = Poly : 1.083755e-19 - static const T Y = 1.450342178344726562500e+00f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.450342178344726562500e+00f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -1.970280410292263112917e-01), BOOST_MATH_BIG_CONSTANT(T, 64, -4.058564803062959169322e+00), @@ -328,7 +334,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 64, 4.319614662598089438939e+00), BOOST_MATH_BIG_CONSTANT(T, 64, 3.710715864316521856193e-02) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 64, 2.298433045824439052398e+01), @@ -353,7 +359,7 @@ namespace boost { namespace math { namespace detail{ } template - T bessel_k1_imp(const T& x, const std::integral_constant&) + BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant&) { BOOST_MATH_STD_USING if(x <= 1) @@ -362,8 +368,8 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -7.119e-35 // Maximum Relative Change in Control Points : 1.207e-03 // Max Error found at float128 precision = Poly : 7.143688e-35 - static const T Y = 8.695471286773681640625000000000000000e-02f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 8.695471286773681640625000000000000000e-02f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -3.621379534403483072916666666666595475e-03), BOOST_MATH_BIG_CONSTANT(T, 113, 7.074117676930975433219826471336547627e-03), @@ -373,7 +379,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 113, 2.347140307321161346703214099534250263e-10), BOOST_MATH_BIG_CONSTANT(T, 113, 5.569608494081482873946791086435679661e-13) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, -3.580768910152105375615558920428350204e-02), @@ -391,7 +397,7 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : 4.473e-37 // Maximum Relative Change in Control Points : 8.550e-04 // Max Error found at float128 precision = Poly : 8.167701e-35 - static const T P2[] = + BOOST_MATH_STATIC const T P2[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -3.079657578292062244053600156878870690e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -8.133183745732467770755578848987414875e-02), @@ -401,7 +407,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 113, -1.632502325880313239698965376754406011e-09), BOOST_MATH_BIG_CONSTANT(T, 113, -2.311973065898784812266544485665624227e-12) }; - static const T Q2[] = + BOOST_MATH_STATIC const T Q2[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, -1.311471216733781016657962995723287450e-02), @@ -418,8 +424,8 @@ namespace boost { namespace math { namespace detail{ { // Max error in interpolated form: 5.307e-37 // Max Error found at float128 precision = Poly: 7.087862e-35 
- static const T Y = 1.5023040771484375f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.5023040771484375f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -2.489899398329369710528254347931380044e-01), BOOST_MATH_BIG_CONSTANT(T, 113, -6.819080211203854781858815596508456873e+00), @@ -438,7 +444,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 113, 1.039705646510167437971862966128055524e+00), BOOST_MATH_BIG_CONSTANT(T, 113, 1.008418100718254816100425022904039530e-02) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, 2.927456835239137986889227412815459529e+01), @@ -465,8 +471,8 @@ namespace boost { namespace math { namespace detail{ // Expected Error Term : -6.565e-40 // Maximum Relative Change in Control Points : 1.880e-01 // Max Error found at float128 precision = Poly : 2.943572e-35 - static const T Y = 1.308816909790039062500000000000000000f; - static const T P[] = + BOOST_MATH_STATIC const T Y = 1.308816909790039062500000000000000000f; + BOOST_MATH_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -5.550277247453881129211735759447737350e-02), BOOST_MATH_BIG_CONSTANT(T, 113, -3.485883080219574328217554864956175929e+00), @@ -486,7 +492,7 @@ namespace boost { namespace math { namespace detail{ BOOST_MATH_BIG_CONSTANT(T, 113, 8.981057433937398731355768088809437625e+05), BOOST_MATH_BIG_CONSTANT(T, 113, 2.519440069856232098711793483639792952e+04) }; - static const T Q[] = + BOOST_MATH_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00), BOOST_MATH_BIG_CONSTANT(T, 113, 7.127348248283623146544565916604103560e+01), @@ -517,33 +523,33 @@ namespace boost { namespace math { namespace detail{ } template - T bessel_k1_imp(const T& x, const std::integral_constant&) + BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant&) { if(boost::math::tools::digits() <= 24) - return bessel_k1_imp(x, std::integral_constant()); + return bessel_k1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 53) - return bessel_k1_imp(x, std::integral_constant()); + return bessel_k1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 64) - return bessel_k1_imp(x, std::integral_constant()); + return bessel_k1_imp(x, boost::math::integral_constant()); else if(boost::math::tools::digits() <= 113) - return bessel_k1_imp(x, std::integral_constant()); + return bessel_k1_imp(x, boost::math::integral_constant()); BOOST_MATH_ASSERT(0); return 0; } - template - inline T bessel_k1(const T& x) + template + inline BOOST_MATH_GPU_ENABLED T bessel_k1(const T& x) { - typedef std::integral_constant::digits == 0) || (std::numeric_limits::radix != 2)) ? + typedef boost::math::integral_constant::digits == 0) || (boost::math::numeric_limits::radix != 2)) ? 0 : - std::numeric_limits::digits <= 24 ? + boost::math::numeric_limits::digits <= 24 ? 24 : - std::numeric_limits::digits <= 53 ? + boost::math::numeric_limits::digits <= 53 ? 53 : - std::numeric_limits::digits <= 64 ? + boost::math::numeric_limits::digits <= 64 ? 64 : - std::numeric_limits::digits <= 113 ? + boost::math::numeric_limits::digits <= 113 ? 
113 : -1 > tag_type; diff --git a/include/boost/math/special_functions/detail/bessel_kn.hpp b/include/boost/math/special_functions/detail/bessel_kn.hpp index d0ddcd0db..41becc8aa 100644 --- a/include/boost/math/special_functions/detail/bessel_kn.hpp +++ b/include/boost/math/special_functions/detail/bessel_kn.hpp @@ -10,8 +10,12 @@ #pragma once #endif +#include +#include +#include #include #include +#include #include // Modified Bessel function of the second kind of integer order @@ -20,14 +24,14 @@ namespace boost { namespace math { namespace detail{ template -T bessel_kn(int n, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T bessel_kn(int n, T x, const Policy& pol) { BOOST_MATH_STD_USING T value, current, prev; using namespace boost::math::tools; - static const char* function = "boost::math::bessel_kn<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::bessel_kn<%1%>(%1%,%1%)"; if (x < 0) { diff --git a/include/boost/math/special_functions/detail/bessel_y0.hpp b/include/boost/math/special_functions/detail/bessel_y0.hpp index 1679820d1..f1aea6acb 100644 --- a/include/boost/math/special_functions/detail/bessel_y0.hpp +++ b/include/boost/math/special_functions/detail/bessel_y0.hpp @@ -12,6 +12,7 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif +#include #include #include #include @@ -36,12 +37,12 @@ namespace boost { namespace math { namespace detail{ template -T bessel_y0(T x, const Policy&); +BOOST_MATH_GPU_ENABLED T bessel_y0(T x, const Policy&); template -T bessel_y0(T x, const Policy&) +BOOST_MATH_GPU_ENABLED T bessel_y0(T x, const Policy&) { - static const T P1[] = { + BOOST_MATH_STATIC const T P1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0723538782003176831e+11)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.3716255451260504098e+09)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.0422274357376619816e+08)), @@ -49,7 +50,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0102532948020907590e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8402381979244993524e+01)), }; - static const T Q1[] = { + BOOST_MATH_STATIC const T Q1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.8873865738997033405e+11)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.1617187777290363573e+09)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5662956624278251596e+07)), @@ -57,7 +58,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6475986689240190091e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T P2[] = { + BOOST_MATH_STATIC const T P2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -2.2213976967566192242e+13)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -5.5107435206722644429e+11)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.3600098638603061642e+10)), @@ -66,7 +67,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4566865832663635920e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7427031242901594547e+01)), }; - static const T Q2[] = { + BOOST_MATH_STATIC const T Q2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.3386146580707264428e+14)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4266824419412347550e+12)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.4015103849971240096e+10)), @@ -75,7 +76,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.3030857612070288823e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T P3[] = { + BOOST_MATH_STATIC const T P3[] = { 
static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.0728726905150210443e+15)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.7016641869173237784e+14)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2829912364088687306e+11)), @@ -85,7 +86,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1363534169313901632e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.7439661319197499338e+01)), }; - static const T Q3[] = { + BOOST_MATH_STATIC const T Q3[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.4563724628846457519e+17)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.9272425569640309819e+15)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2598377924042897629e+13)), @@ -95,7 +96,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.7903362168128450017e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T PC[] = { + BOOST_MATH_STATIC const T PC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684302e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1345386639580765797e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1170523380864944322e+04)), @@ -103,7 +104,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5376201909008354296e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.8961548424210455236e-01)), }; - static const T QC[] = { + BOOST_MATH_STATIC const T QC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684318e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1370412495510416640e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1215350561880115730e+04)), @@ -111,7 +112,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5711159858080893649e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T PS[] = { + BOOST_MATH_STATIC const T PS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.9226600200800094098e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8591953644342993800e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1183429920482737611e+02)), @@ -119,7 +120,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2441026745835638459e+00)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -8.8033303048680751817e-03)), }; - static const T QS[] = { + BOOST_MATH_STATIC const T QS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.7105024128512061905e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1951131543434613647e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.2642780169211018836e+03)), @@ -127,7 +128,7 @@ T bessel_y0(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 9.0593769594993125859e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.9357696627916752158e-01)), + BOOST_MATH_STATIC const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.9357696627916752158e-01)), x2 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.9576784193148578684e+00)), x3 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0860510603017726976e+00)), x11 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.280e+02)), diff --git a/include/boost/math/special_functions/detail/bessel_y1.hpp b/include/boost/math/special_functions/detail/bessel_y1.hpp index 3ac696bb5..0f0dbdf3b 100644 --- a/include/boost/math/special_functions/detail/bessel_y1.hpp +++ b/include/boost/math/special_functions/detail/bessel_y1.hpp @@ -12,6 +12,7 @@ #pragma warning(disable:4702) // Unreachable code (release mode only warning) #endif 
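Every hunk in these polynomial-approximation files follows the same recipe: the coefficient tables lose their host-only storage class on GPU targets (via BOOST_MATH_STATIC) while BOOST_MATH_BIG_CONSTANT keeps initialising extended-precision types exactly. Reduced to its essentials, the pattern looks like the sketch below (illustration only, with made-up coefficients; BOOST_MATH_STATIC is assumed to expand to `static` on host builds and to a device-compatible qualifier, possibly nothing, under CUDA/NVRTC):

    #include <boost/math/tools/config.hpp>
    #include <boost/math/tools/rational.hpp>

    template <typename T>
    BOOST_MATH_GPU_ENABLED T rational_approx_sketch(T z)
    {
       // Hypothetical coefficients; the real tables above carry 64- and
       // 113-bit values wrapped in BOOST_MATH_BIG_CONSTANT.
       BOOST_MATH_STATIC const T P[] = { T(1), T(-2), T(3) };
       BOOST_MATH_STATIC const T Q[] = { T(1), T(4), T(5) };
       return boost::math::tools::evaluate_polynomial(P, z)
            / boost::math::tools::evaluate_polynomial(Q, z);
    }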
+#include #include #include #include @@ -36,12 +37,12 @@ namespace boost { namespace math { namespace detail{ template -T bessel_y1(T x, const Policy&); +BOOST_MATH_GPU_ENABLED T bessel_y1(T x, const Policy&); template -T bessel_y1(T x, const Policy&) +BOOST_MATH_GPU_ENABLED T bessel_y1(T x, const Policy&) { - static const T P1[] = { + BOOST_MATH_STATIC const T P1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.0535726612579544093e+13)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4708611716525426053e+12)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.7595974497819597599e+11)), @@ -50,7 +51,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2157953222280260820e+05)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -3.1714424660046133456e+02)), }; - static const T Q1[] = { + BOOST_MATH_STATIC const T Q1[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.0737873921079286084e+14)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1272286200406461981e+12)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.7800352738690585613e+10)), @@ -59,7 +60,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.2079908168393867438e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T P2[] = { + BOOST_MATH_STATIC const T P2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1514276357909013326e+19)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -5.6808094574724204577e+18)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -2.3638408497043134724e+16)), @@ -70,7 +71,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.9153806858264202986e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2337180442012953128e+03)), }; - static const T Q2[] = { + BOOST_MATH_STATIC const T Q2[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.3321844313316185697e+20)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.6968198822857178911e+18)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.0837179548112881950e+16)), @@ -81,7 +82,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.2855164849321609336e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T PC[] = { + BOOST_MATH_STATIC const T PC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278571e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9422465050776411957e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -6.6033732483649391093e+06)), @@ -90,7 +91,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.6116166443246101165e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)), }; - static const T QC[] = { + BOOST_MATH_STATIC const T QC[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278568e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9341243899345856590e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -6.5853394797230870728e+06)), @@ -99,7 +100,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4550094401904961825e+03)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T PS[] = { + BOOST_MATH_STATIC const T PS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.3220913409857223519e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.5145160675335701966e+04)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6178836581270835179e+04)), @@ -108,7 +109,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3.5265133846636032186e+01)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)), }; - 
static const T QS[] = { + BOOST_MATH_STATIC const T QS[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0871281941028743574e+05)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8194580422439972989e+06)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4194606696037208929e+06)), @@ -117,7 +118,7 @@ T bessel_y1(T x, const Policy&) static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 8.6383677696049909675e+02)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)), }; - static const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1971413260310170351e+00)), + BOOST_MATH_STATIC const T x1 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1971413260310170351e+00)), x2 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4296810407941351328e+00)), x11 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 5.620e+02)), x12 = static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8288260310170351490e-03)), diff --git a/include/boost/math/special_functions/detail/bessel_yn.hpp b/include/boost/math/special_functions/detail/bessel_yn.hpp index 73dee0bbb..a45d1761c 100644 --- a/include/boost/math/special_functions/detail/bessel_yn.hpp +++ b/include/boost/math/special_functions/detail/bessel_yn.hpp @@ -10,9 +10,11 @@ #pragma once #endif +#include #include #include #include +#include #include // Bessel function of the second kind of integer order @@ -21,14 +23,14 @@ namespace boost { namespace math { namespace detail{ template -T bessel_yn(int n, T x, const Policy& pol) +BOOST_MATH_GPU_ENABLED T bessel_yn(int n, T x, const Policy& pol) { BOOST_MATH_STD_USING T value, factor, current, prev; using namespace boost::math::tools; - static const char* function = "boost::math::bessel_yn<%1%>(%1%,%1%)"; + constexpr auto function = "boost::math::bessel_yn<%1%>(%1%,%1%)"; if ((x == 0) && (n == 0)) { diff --git a/include/boost/math/special_functions/detail/iconv.hpp b/include/boost/math/special_functions/detail/iconv.hpp index 90b4aa938..20889d411 100644 --- a/include/boost/math/special_functions/detail/iconv.hpp +++ b/include/boost/math/special_functions/detail/iconv.hpp @@ -10,28 +10,29 @@ #pragma once #endif -#include +#include +#include #include namespace boost { namespace math { namespace detail{ template -inline int iconv_imp(T v, Policy const&, std::true_type const&) +BOOST_MATH_GPU_ENABLED inline int iconv_imp(T v, Policy const&, boost::math::true_type const&) { return static_cast(v); } template -inline int iconv_imp(T v, Policy const& pol, std::false_type const&) +BOOST_MATH_GPU_ENABLED inline int iconv_imp(T v, Policy const& pol, boost::math::false_type const&) { BOOST_MATH_STD_USING return iround(v, pol); } template -inline int iconv(T v, Policy const& pol) +BOOST_MATH_GPU_ENABLED inline int iconv(T v, Policy const& pol) { - typedef typename std::is_convertible::type tag_type; + typedef typename boost::math::is_convertible::type tag_type; return iconv_imp(v, pol, tag_type()); } diff --git a/include/boost/math/special_functions/detail/unchecked_factorial.hpp b/include/boost/math/special_functions/detail/unchecked_factorial.hpp index f7720a2ab..92481f2c6 100644 --- a/include/boost/math/special_functions/detail/unchecked_factorial.hpp +++ b/include/boost/math/special_functions/detail/unchecked_factorial.hpp @@ -10,19 +10,23 @@ #pragma once #endif -#ifdef _MSC_VER -#pragma warning(push) // Temporary until lexical cast fixed. 
-#pragma warning(disable: 4127 4701) -#endif -#include -#ifdef _MSC_VER -#pragma warning(pop) -#endif -#include +#include +#include +#include +#include #include -#include -#include -#include + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +# ifdef _MSC_VER +# pragma warning(push) // Temporary until lexical cast fixed. +# pragma warning(disable: 4127 4701) +# endif +# include +# ifdef _MSC_VER +# pragma warning(pop) +# endif +#endif + #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -46,13 +50,21 @@ struct max_factorial; template struct unchecked_factorial_data; +#ifdef BOOST_MATH_HAS_NVRTC + +// Need fwd decl +template +BOOST_MATH_GPU_ENABLED inline T unchecked_factorial(unsigned i); + +#endif + #ifndef BOOST_MATH_HAS_GPU_SUPPORT template struct unchecked_factorial_data { #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - static constexpr std::array factorials = { { + static constexpr boost::math::array factorials = { { 1.0F, 1.0F, 2.0F, @@ -90,15 +102,15 @@ struct unchecked_factorial_data 0.29523279903960414084761860964352e39F, }}; #else - static const std::array factorials; + static const boost::math::array factorials; #endif }; template #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - constexpr std::array unchecked_factorial_data::factorials; + constexpr boost::math::array unchecked_factorial_data::factorials; #else - const std::array unchecked_factorial_data::factorials = {{ + const boost::math::array unchecked_factorial_data::factorials = {{ 1.0F, 1.0F, 2.0F, @@ -204,7 +216,7 @@ template struct unchecked_factorial_data { #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - static constexpr std::array factorials = { { + static constexpr boost::math::array factorials = { { 1.0, 1.0, 2.0, @@ -378,15 +390,15 @@ struct unchecked_factorial_data 0.7257415615307998967396728211129263114717e307, }}; #else - static const std::array factorials; + static const boost::math::array factorials; #endif }; template #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - constexpr std::array unchecked_factorial_data::factorials; + constexpr boost::math::array unchecked_factorial_data::factorials; #else - const std::array unchecked_factorial_data::factorials = {{ + const boost::math::array unchecked_factorial_data::factorials = {{ 1.0, 1.0, 2.0, @@ -633,7 +645,7 @@ template struct unchecked_factorial_data { #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - static constexpr std::array factorials = { { + static constexpr boost::math::array factorials = { { 1L, 1L, 2L, @@ -807,15 +819,15 @@ struct unchecked_factorial_data 0.7257415615307998967396728211129263114717e307L, }}; #else - static const std::array factorials; + static const boost::math::array factorials; #endif }; template #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - constexpr std::array unchecked_factorial_data::factorials; + constexpr boost::math::array unchecked_factorial_data::factorials; #else - const std::array unchecked_factorial_data::factorials = {{ + const boost::math::array unchecked_factorial_data::factorials = {{ 1L, 1L, 2L, @@ -1008,7 +1020,7 @@ template struct unchecked_factorial_data { #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES - static constexpr std::array factorials = { { + static constexpr boost::math::array factorials = { { 1, 1, 2, @@ -1182,15 +1194,15 @@ struct unchecked_factorial_data 0.7257415615307998967396728211129263114717e307Q, } }; #else - static const std::array factorials; + static const boost::math::array factorials; #endif }; template #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES -constexpr std::array unchecked_factorial_data::factorials; +constexpr boost::math::array 
unchecked_factorial_data::factorials; #else -const std::array unchecked_factorial_data::factorials = { { +const boost::math::array unchecked_factorial_data::factorials = { { 1, 1, 2, @@ -1402,7 +1414,7 @@ const typename unchecked_factorial_initializer::init unchecked_factorial_init template -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant&) +inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant&) { // // If you're foolish enough to instantiate factorial @@ -1416,10 +1428,10 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant(factorial(n)); // See factorial documentation for more detail. // - static_assert(!std::is_integral::value && !std::numeric_limits::is_integer, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value && !boost::math::numeric_limits::is_integer, "Type T must not be an integral type"); // We rely on C++11 thread safe initialization here: - static const std::array factorials = {{ + static const boost::math::array factorials = {{ T(boost::math::tools::convert_from_string("1")), T(boost::math::tools::convert_from_string("1")), T(boost::math::tools::convert_from_string("2")), @@ -1527,7 +1539,7 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant&) +inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant&) { // // If you're foolish enough to instantiate factorial @@ -1541,7 +1553,7 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant(factorial(n)); // See factorial documentation for more detail. // - static_assert(!std::is_integral::value && !std::numeric_limits::is_integer, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value && !boost::math::numeric_limits::is_integer, "Type T must not be an integral type"); static const char* const factorial_strings[] = { "1", @@ -1667,13 +1679,13 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant::digits>&) +BOOST_MATH_GPU_ENABLED inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant::digits>&) { return unchecked_factorial(i); } template -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant::digits>&) +BOOST_MATH_GPU_ENABLED inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant::digits>&) { return unchecked_factorial(i); } @@ -1682,14 +1694,14 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant&) +inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant&) { return unchecked_factorial(i); } #endif #ifdef BOOST_MATH_USE_FLOAT128 template -inline T unchecked_factorial_imp(unsigned i, const std::integral_constant&) +inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant&) { return unchecked_factorial(i); } @@ -1698,14 +1710,14 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant -inline T unchecked_factorial(unsigned i) +BOOST_MATH_GPU_ENABLED inline T unchecked_factorial(unsigned i) { typedef typename boost::math::policies::precision >::type tag_type; return unchecked_factorial_imp(i, tag_type()); } #ifdef BOOST_MATH_USE_FLOAT128 -#define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL : 
std::numeric_limits::digits == 113 ? max_factorial::value +#define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL : boost::math::numeric_limits::digits == 113 ? max_factorial::value #else #define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL #endif @@ -1714,10 +1726,10 @@ template struct max_factorial { static constexpr unsigned value = - std::numeric_limits::digits == std::numeric_limits::digits ? max_factorial::value - : std::numeric_limits::digits == std::numeric_limits::digits ? max_factorial::value + boost::math::numeric_limits::digits == boost::math::numeric_limits::digits ? max_factorial::value + : boost::math::numeric_limits::digits == boost::math::numeric_limits::digits ? max_factorial::value #ifndef BOOST_MATH_GPU_ENABLED - : std::numeric_limits::digits == std::numeric_limits::digits ? max_factorial::value + : boost::math::numeric_limits::digits == boost::math::numeric_limits::digits ? max_factorial::value BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL #endif : 100; diff --git a/include/boost/math/special_functions/expm1.hpp b/include/boost/math/special_functions/expm1.hpp index b59721e93..5e61ca20b 100644 --- a/include/boost/math/special_functions/expm1.hpp +++ b/include/boost/math/special_functions/expm1.hpp @@ -15,9 +15,6 @@ #ifndef BOOST_MATH_HAS_NVRTC -#include -#include -#include #include #include #include @@ -25,6 +22,9 @@ #include #include #include +#include +#include +#include #if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128) // @@ -49,10 +49,10 @@ namespace detail { typedef T result_type; - expm1_series(T x) + BOOST_MATH_GPU_ENABLED expm1_series(T x) : k(0), m_x(x), m_term(1) {} - T operator()() + BOOST_MATH_GPU_ENABLED T operator()() { ++k; m_term *= m_x; @@ -60,7 +60,7 @@ namespace detail return m_term; } - int count()const + BOOST_MATH_GPU_ENABLED int count()const { return k; } @@ -78,26 +78,28 @@ struct expm1_initializer { struct init { - init() + BOOST_MATH_GPU_ENABLED init() { do_init(tag()); } template - static void do_init(const std::integral_constant&){} - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&){} + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { expm1(T(0.5)); } - static void do_init(const std::integral_constant&) + BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant&) { expm1(T(0.5)); } - void force_instantiate()const{} + BOOST_MATH_GPU_ENABLED void force_instantiate()const{} }; - static const init initializer; - static void force_instantiate() + BOOST_MATH_STATIC const init initializer; + BOOST_MATH_GPU_ENABLED static void force_instantiate() { + #ifndef BOOST_MATH_HAS_GPU_SUPPORT initializer.force_instantiate(); + #endif } }; @@ -110,7 +112,7 @@ const typename expm1_initializer::init expm1_initializer |x| > epsilon. 
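The expm1_series functor above is an instance of the protocol tools::sum_series expects: a nested result_type, an operator() that returns successive terms, and termination once a term drops below the requested tolerance or the iteration cap is exhausted. A self-contained sketch of a conforming functor (hypothetical geometric series, so the expected sum for ratio r with |r| < 1 is 1/(1 - r); boost::math::uintmax_t is the GPU-safe alias this patch routes the iteration counters through):

    #include <boost/math/policies/policy.hpp>
    #include <boost/math/tools/config.hpp>
    #include <boost/math/tools/precision.hpp>
    #include <boost/math/tools/series.hpp>

    template <typename T>
    struct geometric_series
    {
       typedef T result_type;
       BOOST_MATH_GPU_ENABLED explicit geometric_series(T r) : term(1), ratio(r) {}
       BOOST_MATH_GPU_ENABLED T operator()()
       {
          T current = term;   // return the current term, then advance
          term *= ratio;
          return current;
       }
       T term, ratio;
    };

    template <typename T>
    BOOST_MATH_GPU_ENABLED T sum_geometric(T r)   // expects |r| < 1
    {
       geometric_series<T> s(r);
       boost::math::uintmax_t max_iter =
          boost::math::policies::get_max_series_iterations<boost::math::policies::policy<> >();
       // The second argument is the relative tolerance at which to stop.
       return boost::math::tools::sum_series(s, boost::math::tools::epsilon<T>(), max_iter);
    }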
// template -T expm1_imp(T x, const std::integral_constant&, const Policy& pol) +T expm1_imp(T x, const boost::math::integral_constant&, const Policy& pol) { BOOST_MATH_STD_USING @@ -132,7 +134,7 @@ T expm1_imp(T x, const std::integral_constant&, const Policy& pol) if(a < tools::epsilon()) return x; detail::expm1_series s(x); - std::uintmax_t max_iter = policies::get_max_series_iterations(); + boost::math::uintmax_t max_iter = policies::get_max_series_iterations(); T result = tools::sum_series(s, policies::get_epsilon(), max_iter); @@ -141,7 +143,7 @@ T expm1_imp(T x, const std::integral_constant&, const Policy& pol) } template -T expm1_imp(T x, const std::integral_constant&, const P& pol) +BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant&, const P& pol) { BOOST_MATH_STD_USING @@ -159,16 +161,16 @@ T expm1_imp(T x, const std::integral_constant&, const P& pol) if(a < tools::epsilon()) return x; - static const float Y = 0.10281276702880859e1f; - static const T n[] = { static_cast(-0.28127670288085937e-1), static_cast(0.51278186299064534e0), static_cast(-0.6310029069350198e-1), static_cast(0.11638457975729296e-1), static_cast(-0.52143390687521003e-3), static_cast(0.21491399776965688e-4) }; - static const T d[] = { 1, static_cast(-0.45442309511354755e0), static_cast(0.90850389570911714e-1), static_cast(-0.10088963629815502e-1), static_cast(0.63003407478692265e-3), static_cast(-0.17976570003654402e-4) }; + BOOST_MATH_STATIC const float Y = 0.10281276702880859e1f; + BOOST_MATH_STATIC const T n[] = { static_cast(-0.28127670288085937e-1), static_cast(0.51278186299064534e0), static_cast(-0.6310029069350198e-1), static_cast(0.11638457975729296e-1), static_cast(-0.52143390687521003e-3), static_cast(0.21491399776965688e-4) }; + BOOST_MATH_STATIC const T d[] = { 1, static_cast(-0.45442309511354755e0), static_cast(0.90850389570911714e-1), static_cast(-0.10088963629815502e-1), static_cast(0.63003407478692265e-3), static_cast(-0.17976570003654402e-4) }; T result = x * Y + x * tools::evaluate_polynomial(n, x) / tools::evaluate_polynomial(d, x); return result; } template -T expm1_imp(T x, const std::integral_constant&, const P& pol) +BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant&, const P& pol) { BOOST_MATH_STD_USING @@ -186,8 +188,8 @@ T expm1_imp(T x, const std::integral_constant&, const P& pol) if(a < tools::epsilon()) return x; - static const float Y = 0.10281276702880859375e1f; - static const T n[] = { + BOOST_MATH_STATIC const float Y = 0.10281276702880859375e1f; + BOOST_MATH_STATIC const T n[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.281276702880859375e-1), BOOST_MATH_BIG_CONSTANT(T, 64, 0.512980290285154286358e0), BOOST_MATH_BIG_CONSTANT(T, 64, -0.667758794592881019644e-1), @@ -196,7 +198,7 @@ T expm1_imp(T x, const std::integral_constant&, const P& pol) BOOST_MATH_BIG_CONSTANT(T, 64, 0.447441185192951335042e-4), BOOST_MATH_BIG_CONSTANT(T, 64, -0.714539134024984593011e-6) }; - static const T d[] = { + BOOST_MATH_STATIC const T d[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, -0.461477618025562520389e0), BOOST_MATH_BIG_CONSTANT(T, 64, 0.961237488025708540713e-1), @@ -211,7 +213,7 @@ T expm1_imp(T x, const std::integral_constant&, const P& pol) } template -T expm1_imp(T x, const std::integral_constant&, const P& pol) +BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant&, const P& pol) { BOOST_MATH_STD_USING @@ -263,7 +265,7 @@ T expm1_imp(T x, const std::integral_constant&, const P& pol) } // 
namespace detail template -inline typename tools::promote_args::type expm1(T x, const Policy& /* pol */) +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type expm1(T x, const Policy& /* pol */) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -275,7 +277,7 @@ inline typename tools::promote_args::type expm1(T x, const Policy& /* pol */) policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - typedef std::integral_constant +#include +#include +#include +#include #include #include -#include +#include + #ifdef _MSC_VER #pragma warning(push) // Temporary until lexical cast fixed. #pragma warning(disable: 4127 4701) @@ -21,16 +25,14 @@ #ifdef _MSC_VER #pragma warning(pop) #endif -#include -#include namespace boost { namespace math { template -inline T factorial(unsigned i, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T factorial(unsigned i, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); // factorial(n) is not implemented // because it would overflow integral type T for too small n // to be useful. Use instead a floating-point type, @@ -49,7 +51,7 @@ inline T factorial(unsigned i, const Policy& pol) } template -inline T factorial(unsigned i) +BOOST_MATH_GPU_ENABLED inline T factorial(unsigned i) { return factorial(i, policies::policy<>()); } @@ -72,9 +74,9 @@ inline double factorial(unsigned i) } */ template -T double_factorial(unsigned i, const Policy& pol) +BOOST_MATH_GPU_ENABLED T double_factorial(unsigned i, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); BOOST_MATH_STD_USING // ADL lookup of std names if(i & 1) { @@ -107,17 +109,20 @@ T double_factorial(unsigned i, const Policy& pol) } template -inline T double_factorial(unsigned i) +BOOST_MATH_GPU_ENABLED inline T double_factorial(unsigned i) { return double_factorial(i, policies::policy<>()); } +// TODO(mborland): We do not currently have support for tgamma_delta_ratio +#ifndef BOOST_MATH_HAS_GPU_SUPPORT + namespace detail{ template T rising_factorial_imp(T x, int n, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); if(x < 0) { // @@ -165,7 +170,7 @@ T rising_factorial_imp(T x, int n, const Policy& pol) template inline T falling_factorial_imp(T x, unsigned n, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); BOOST_MATH_STD_USING // ADL of std names if(x == 0) return 0; @@ -262,6 +267,8 @@ inline typename tools::promote_args::type static_cast(x), n, pol); } +#endif // BOOST_MATH_HAS_GPU_SUPPORT + } // namespace math } // namespace boost diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index afb8e9728..9268ba415 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -2287,6 +2287,7 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t #else #include +#include namespace boost { namespace math { @@ -2295,7 +2296,7 @@ inline 
BOOST_MATH_GPU_ENABLED float tgamma(float x) { return ::tgammaf(x); } inline BOOST_MATH_GPU_ENABLED double tgamma(double x) { return ::tgamma(x); } template -inline BOOST_MATH_GPU_ENABLED T tgamma(T x, const Policy&) +BOOST_MATH_GPU_ENABLED T tgamma(T x, const Policy&) { return boost::math::tgamma(x); } @@ -2304,11 +2305,49 @@ inline BOOST_MATH_GPU_ENABLED float lgamma(float x) { return ::lgammaf(x); } inline BOOST_MATH_GPU_ENABLED double lgamma(double x) { return ::lgamma(x); } template -inline BOOST_MATH_GPU_ENABLED T lgamma(T x, const Policy&) +BOOST_MATH_GPU_ENABLED T lgamma(T x, const Policy&) { return boost::math::lgamma(x); } +template <typename T, typename Policy> +BOOST_MATH_GPU_ENABLED T lgamma(T x, int* sign, const Policy&) +{ + auto res = boost::math::lgamma(x); + if (sign != nullptr) + { + // *sign reports the sign of tgamma(x), not of the returned logarithm: + // gamma is positive for x > 0 and alternates in sign between + // consecutive poles on the negative real axis. + if ((x > 0) || (static_cast<long long>(::floor(x)) % 2 == 0)) + { + *sign = 1; + } + else + { + *sign = -1; + } + } + + return res; +} + +template <typename T> +BOOST_MATH_GPU_ENABLED T tgamma1pm1(T z) +{ + using namespace boost::math; + + // tgamma1pm1(z) = tgamma(1 + z) - 1: route small |z| through lgamma and + // expm1 so results near zero keep their relative accuracy. + if (fabs(z) < T(0.55)) + { + return expm1(lgamma(1 + z)); + } + + return tgamma(1 + z) - 1; +} + +template <typename T, typename Policy> +BOOST_MATH_GPU_ENABLED T tgamma1pm1(T x, const Policy&) +{ + return tgamma1pm1(x); +} + } // namespace math } // namespace boost diff --git a/include/boost/math/special_functions/hypot.hpp b/include/boost/math/special_functions/hypot.hpp index c56c75110..f38e37e87 100644 --- a/include/boost/math/special_functions/hypot.hpp +++ b/include/boost/math/special_functions/hypot.hpp @@ -12,20 +12,20 @@ #include #include +#include +#include #include #include -#include // for swap -#include namespace boost{ namespace math{ namespace detail{ template -T hypot_imp(T x, T y, const Policy& pol) +BOOST_MATH_GPU_ENABLED T hypot_imp(T x, T y, const Policy& pol) { // // Normalize x and y, so that both are positive and x >= y: // - using std::fabs; using std::sqrt; // ADL of std names + BOOST_MATH_STD_USING x = fabs(x); y = fabs(y); @@ -35,16 +35,16 @@ T hypot_imp(T x, T y, const Policy& pol) #pragma warning(disable: 4127) #endif // special case, see C99 Annex F: - if(std::numeric_limits::has_infinity - && ((x == std::numeric_limits::infinity()) - || (y == std::numeric_limits::infinity()))) + if(boost::math::numeric_limits::has_infinity + && ((x == boost::math::numeric_limits::infinity()) + || (y == boost::math::numeric_limits::infinity()))) return policies::raise_overflow_error("boost::math::hypot<%1%>(%1%,%1%)", nullptr, pol); #ifdef _MSC_VER #pragma warning(pop) #endif if(y > x) - (std::swap)(x, y); + BOOST_MATH_GPU_SAFE_SWAP(x, y); if(x * tools::epsilon() >= y) return x; @@ -56,7 +56,7 @@ T hypot_imp(T x, T y, const Policy& pol) } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type hypot(T1 x, T2 y) { typedef typename tools::promote_args::type result_type; @@ -65,7 +65,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type hypot(T1 x, T2 y, const Policy& pol) { typedef typename tools::promote_args::type result_type; diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 289b27592..21f51e507 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -24,12 +24,16 @@ #pragma once #endif +#include + +#ifndef BOOST_MATH_HAS_NVRTC + #include #include #include -#include #include #include // for argument promotion.
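The rescaling inside hypot_imp above is the entire reason the function exists: after ordering the arguments so that x >= y, the ratio y/x is at most one, so squaring it cannot overflow even when x*x would. A standalone illustration of the failure mode being avoided (plain C++, independent of the patch):

    #include <cmath>
    #include <cstdio>

    int main()
    {
       double x = 1e200, y = 1e200;                  // true hypot ~1.414e200, representable
       double naive = std::sqrt(x * x + y * y);      // x*x overflows to inf first
       double rat = y / x;                           // <= 1 once x >= y
       double scaled = x * std::sqrt(1 + rat * rat); // stays finite
       std::printf("naive = %g, scaled = %g\n", naive, scaled);
       return 0;
    }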
+#include #include #define BOOST_NO_MACRO_EXPAND /**/ @@ -420,15 +424,15 @@ namespace boost template struct max_factorial; template - RT factorial(unsigned int); + BOOST_MATH_GPU_ENABLED RT factorial(unsigned int); template - RT factorial(unsigned int, const Policy& pol); + BOOST_MATH_GPU_ENABLED RT factorial(unsigned int, const Policy& pol); template BOOST_MATH_GPU_ENABLED RT unchecked_factorial(unsigned int BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(RT)); template - RT double_factorial(unsigned i); + BOOST_MATH_GPU_ENABLED RT double_factorial(unsigned i); template - RT double_factorial(unsigned i, const Policy& pol); + BOOST_MATH_GPU_ENABLED RT double_factorial(unsigned i, const Policy& pol); template tools::promote_args_t falling_factorial(RT x, unsigned n); @@ -554,11 +558,11 @@ namespace boost // Hypotenuse function sqrt(x ^ 2 + y ^ 2). template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t hypot(T1 x, T2 y); template - tools::promote_args_t + BOOST_MATH_GPU_ENABLED tools::promote_args_t hypot(T1 x, T2 y, const Policy&); // cbrt - cube root. @@ -607,10 +611,10 @@ namespace boost // sinus cardinals: template - tools::promote_args_t sinc_pi(T x); + BOOST_MATH_GPU_ENABLED tools::promote_args_t sinc_pi(T x); template - tools::promote_args_t sinc_pi(T x, const Policy&); + BOOST_MATH_GPU_ENABLED tools::promote_args_t sinc_pi(T x, const Policy&); template tools::promote_args_t sinhc_pi(T x); @@ -639,36 +643,36 @@ namespace boost namespace detail{ - typedef std::integral_constant bessel_no_int_tag; // No integer optimisation possible. - typedef std::integral_constant bessel_maybe_int_tag; // Maybe integer optimisation. - typedef std::integral_constant bessel_int_tag; // Definite integer optimisation. + typedef boost::math::integral_constant bessel_no_int_tag; // No integer optimisation possible. + typedef boost::math::integral_constant bessel_maybe_int_tag; // Maybe integer optimisation. + typedef boost::math::integral_constant bessel_int_tag; // Definite integer optimisation. 
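These three tags drive the overload selection further down the call chain: bessel_traits folds the order type and the policy's precision into one of them, so that, for example, an int order with at most 64 bits of working precision dispatches to the integer recurrences in bessel_jn/bessel_yn. Stripped of the precision logic, the mechanism is ordinary tag dispatch (sketch only, hypothetical names; boost::math::conditional and is_integral are the GPU-safe trait aliases used throughout this patch, header name assumed):

    #include <boost/math/tools/type_traits.hpp>   // header name assumed

    namespace sketch {

    typedef boost::math::integral_constant<int, 2> int_tag;
    typedef boost::math::integral_constant<int, 1> maybe_int_tag;

    template <typename T1, typename T2>
    T2 cyl_j(T1 /*v*/, T2 x, const int_tag&)       { return x; } // integer-order recurrence
    template <typename T1, typename T2>
    T2 cyl_j(T1 /*v*/, T2 x, const maybe_int_tag&) { return x; } // check v at runtime

    template <typename T1, typename T2>
    T2 cyl_j(T1 v, T2 x)
    {
       typedef typename boost::math::conditional<
          boost::math::is_integral<T1>::value, int_tag, maybe_int_tag>::type tag_type;
       return cyl_j(v, x, tag_type());
    }

    } // namespace sketch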
template struct bessel_traits { - using result_type = typename std::conditional< - std::is_integral::value, + using result_type = typename boost::math::conditional< + boost::math::is_integral::value, typename tools::promote_args::type, tools::promote_args_t >::type; typedef typename policies::precision::type precision_type; - using optimisation_tag = typename std::conditional< + using optimisation_tag = typename boost::math::conditional< (precision_type::value <= 0 || precision_type::value > 64), bessel_no_int_tag, - typename std::conditional< - std::is_integral::value, + typename boost::math::conditional< + boost::math::is_integral::value, bessel_int_tag, bessel_maybe_int_tag >::type >::type; - using optimisation_tag128 = typename std::conditional< + using optimisation_tag128 = typename boost::math::conditional< (precision_type::value <= 0 || precision_type::value > 113), bessel_no_int_tag, - typename std::conditional< - std::is_integral::value, + typename boost::math::conditional< + boost::math::is_integral::value, bessel_int_tag, bessel_maybe_int_tag >::type @@ -678,98 +682,98 @@ namespace boost // Bessel functions: template - typename detail::bessel_traits::result_type cyl_bessel_j(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_j(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits::result_type cyl_bessel_j_prime(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_j_prime(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_bessel_j(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_j(T1 v, T2 x); template - typename detail::bessel_traits >::result_type cyl_bessel_j_prime(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_j_prime(T1 v, T2 x); template - typename detail::bessel_traits::result_type sph_bessel(unsigned v, T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type sph_bessel(unsigned v, T x, const Policy& pol); template - typename detail::bessel_traits::result_type sph_bessel_prime(unsigned v, T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type sph_bessel_prime(unsigned v, T x, const Policy& pol); template - typename detail::bessel_traits >::result_type sph_bessel(unsigned v, T x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type sph_bessel(unsigned v, T x); template - typename detail::bessel_traits >::result_type sph_bessel_prime(unsigned v, T x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type sph_bessel_prime(unsigned v, T x); template - typename detail::bessel_traits::result_type cyl_bessel_i(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_i(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits::result_type cyl_bessel_i_prime(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_i_prime(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_bessel_i(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_i(T1 v, T2 x); template - typename detail::bessel_traits >::result_type cyl_bessel_i_prime(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits 
>::result_type cyl_bessel_i_prime(T1 v, T2 x); template - typename detail::bessel_traits::result_type cyl_bessel_k(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_k(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits::result_type cyl_bessel_k_prime(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_k_prime(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_bessel_k(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_k(T1 v, T2 x); template - typename detail::bessel_traits >::result_type cyl_bessel_k_prime(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_k_prime(T1 v, T2 x); template - typename detail::bessel_traits::result_type cyl_neumann(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_neumann(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits::result_type cyl_neumann_prime(T1 v, T2 x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_neumann_prime(T1 v, T2 x, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_neumann(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_neumann(T1 v, T2 x); template - typename detail::bessel_traits >::result_type cyl_neumann_prime(T1 v, T2 x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_neumann_prime(T1 v, T2 x); template - typename detail::bessel_traits::result_type sph_neumann(unsigned v, T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type sph_neumann(unsigned v, T x, const Policy& pol); template - typename detail::bessel_traits::result_type sph_neumann_prime(unsigned v, T x, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type sph_neumann_prime(unsigned v, T x, const Policy& pol); template - typename detail::bessel_traits >::result_type sph_neumann(unsigned v, T x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type sph_neumann(unsigned v, T x); template - typename detail::bessel_traits >::result_type sph_neumann_prime(unsigned v, T x); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type sph_neumann_prime(unsigned v, T x); template - typename detail::bessel_traits::result_type cyl_bessel_j_zero(T v, int m, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_bessel_j_zero(T v, int m, const Policy& pol); template - typename detail::bessel_traits >::result_type cyl_bessel_j_zero(T v, int m); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_bessel_j_zero(T v, int m); template - OutputIterator cyl_bessel_j_zero(T v, + BOOST_MATH_GPU_ENABLED OutputIterator cyl_bessel_j_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it); template - OutputIterator cyl_bessel_j_zero(T v, + BOOST_MATH_GPU_ENABLED OutputIterator cyl_bessel_j_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it, const Policy&); template - typename detail::bessel_traits::result_type cyl_neumann_zero(T v, int m, const Policy& pol); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits::result_type cyl_neumann_zero(T v, int m, const Policy& pol); template - typename 
detail::bessel_traits >::result_type cyl_neumann_zero(T v, int m); + BOOST_MATH_GPU_ENABLED typename detail::bessel_traits >::result_type cyl_neumann_zero(T v, int m); template - OutputIterator cyl_neumann_zero(T v, + BOOST_MATH_GPU_ENABLED OutputIterator cyl_neumann_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it); template - OutputIterator cyl_neumann_zero(T v, + BOOST_MATH_GPU_ENABLED OutputIterator cyl_neumann_zero(T v, int start_index, unsigned number_of_zeros, OutputIterator out_it, @@ -1400,10 +1404,10 @@ namespace boost \ using boost::math::max_factorial;\ template \ - inline RT factorial(unsigned int i) { return boost::math::factorial(i, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline RT factorial(unsigned int i) { return boost::math::factorial(i, Policy()); }\ using boost::math::unchecked_factorial;\ template \ - inline RT double_factorial(unsigned i){ return boost::math::double_factorial(i, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline RT double_factorial(unsigned i){ return boost::math::double_factorial(i, Policy()); }\ template \ inline boost::math::tools::promote_args_t falling_factorial(RT x, unsigned n){ return boost::math::falling_factorial(x, n, Policy()); }\ template \ @@ -1465,7 +1469,7 @@ namespace boost \ template \ inline boost::math::tools::promote_args_t \ - hypot(T1 x, T2 y){ return boost::math::hypot(x, y, Policy()); }\ + BOOST_MATH_GPU_ENABLED hypot(T1 x, T2 y){ return boost::math::hypot(x, y, Policy()); }\ \ template \ inline boost::math::tools::promote_args_t cbrt(RT z){ return boost::math::cbrt(z, Policy()); }\ @@ -1487,7 +1491,7 @@ namespace boost BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t sqrt1pm1(const T& val){ return boost::math::sqrt1pm1(val, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t sinc_pi(T x){ return boost::math::sinc_pi(x, Policy()); }\ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t sinc_pi(T x){ return boost::math::sinc_pi(x, Policy()); }\ \ template \ inline boost::math::tools::promote_args_t sinhc_pi(T x){ return boost::math::sinhc_pi(x, Policy()); }\ @@ -1817,6 +1821,6 @@ template \ - +#endif // BOOST_MATH_HAS_NVRTC #endif // BOOST_MATH_SPECIAL_MATH_FWD_HPP diff --git a/include/boost/math/special_functions/next.hpp b/include/boost/math/special_functions/next.hpp index 02a208e4e..fd08162f9 100644 --- a/include/boost/math/special_functions/next.hpp +++ b/include/boost/math/special_functions/next.hpp @@ -10,6 +10,11 @@ #pragma once #endif +#include + +// TODO(mborland): Need to remove recursion from these algos +#ifndef BOOST_MATH_HAS_NVRTC + #include #include #include @@ -920,4 +925,6 @@ inline typename tools::promote_args::type float_advance(const T& val, int dis }} // boost math namespaces +#endif + #endif // BOOST_MATH_SPECIAL_NEXT_HPP diff --git a/include/boost/math/special_functions/round.hpp b/include/boost/math/special_functions/round.hpp index 7a76cd32f..bb99da7e3 100644 --- a/include/boost/math/special_functions/round.hpp +++ b/include/boost/math/special_functions/round.hpp @@ -273,6 +273,30 @@ BOOST_MATH_GPU_ENABLED float round(float x, const Policy&) return ::roundf(x); } +template +BOOST_MATH_GPU_ENABLED int iround(T x) +{ + return static_cast(::lround(x)); +} + +template <> +BOOST_MATH_GPU_ENABLED int iround(float x) +{ + return static_cast(::lroundf(x)); +} + +template +BOOST_MATH_GPU_ENABLED int iround(T x, const Policy&) +{ + return static_cast(::lround(x)); +} + +template +BOOST_MATH_GPU_ENABLED int iround(float x, const Policy&) +{ +
return static_cast(::lroundf(x)); +} + template BOOST_MATH_GPU_ENABLED long lround(T x) { diff --git a/include/boost/math/special_functions/sinc.hpp b/include/boost/math/special_functions/sinc.hpp index ff1b2e966..0c18ac346 100644 --- a/include/boost/math/special_functions/sinc.hpp +++ b/include/boost/math/special_functions/sinc.hpp @@ -17,13 +17,13 @@ #include #include +#include #include -#include #include -#include -#include -#include -#include + +#ifndef BOOST_MATH_HAS_NVRTC +#include +#endif // These are the "Sinus Cardinal" functions. @@ -36,7 +36,7 @@ namespace boost // This is the "Sinus Cardinal" of index Pi. template - inline T sinc_pi_imp(const T x) + BOOST_MATH_GPU_ENABLED inline T sinc_pi_imp(const T x) { BOOST_MATH_STD_USING @@ -44,7 +44,7 @@ namespace boost { return 0; } - else if (abs(x) >= 3.3 * tools::forth_root_epsilon()) + else if (abs(x) >= T(3.3) * tools::forth_root_epsilon()) { return(sin(x)/x); } @@ -58,24 +58,23 @@ namespace boost } // namespace detail template - inline typename tools::promote_args::type sinc_pi(T x) + BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type sinc_pi(T x) { typedef typename tools::promote_args::type result_type; return detail::sinc_pi_imp(static_cast(x)); } template - inline typename tools::promote_args::type sinc_pi(T x, const Policy&) + BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type sinc_pi(T x, const Policy&) { typedef typename tools::promote_args::type result_type; return detail::sinc_pi_imp(static_cast(x)); } template class U> - inline U sinc_pi(const U x) + BOOST_MATH_GPU_ENABLED inline U sinc_pi(const U x) { BOOST_MATH_STD_USING - using ::std::numeric_limits; T const taylor_0_bound = tools::epsilon(); T const taylor_2_bound = tools::root_epsilon(); @@ -88,11 +87,11 @@ namespace boost else { // approximation by taylor series in x at 0 up to order 0 -#ifdef __MWERKS__ + #ifdef __MWERKS__ U result = static_cast >(1); -#else + #else U result = U(1); -#endif + #endif if (abs(x) >= taylor_0_bound) { @@ -113,7 +112,7 @@ namespace boost } template class U, class Policy> - inline U sinc_pi(const U x, const Policy&) + BOOST_MATH_GPU_ENABLED inline U sinc_pi(const U x, const Policy&) { return sinc_pi(x); } diff --git a/include/boost/math/special_functions/ulp.hpp b/include/boost/math/special_functions/ulp.hpp index 3c0616db0..5d1617ace 100644 --- a/include/boost/math/special_functions/ulp.hpp +++ b/include/boost/math/special_functions/ulp.hpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace boost{ namespace math{ namespace detail{ diff --git a/include/boost/math/tools/array.hpp b/include/boost/math/tools/array.hpp new file mode 100644 index 000000000..23e666673 --- /dev/null +++ b/include/boost/math/tools/array.hpp @@ -0,0 +1,41 @@ +// Copyright (c) 2024 Matt Borland +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0.
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// The member functions of std::array cannot be used on +// GPU platforms like CUDA since they are missing the __device__ marker. +// Alias as needed to get correct support + +#ifndef BOOST_MATH_TOOLS_ARRAY_HPP +#define BOOST_MATH_TOOLS_ARRAY_HPP + +#include + +#ifdef BOOST_MATH_ENABLE_CUDA + +#include + +namespace boost { +namespace math { + +using cuda::std::array; + +} // namespace math +} // namespace boost + +#else + +#include + +namespace boost { +namespace math { + +using std::array; + +} // namespace math +} // namespace boost + +#endif // BOOST_MATH_ENABLE_CUDA + +#endif // BOOST_MATH_TOOLS_ARRAY_HPP diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 2736d660f..1f444c004 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -676,6 +676,7 @@ namespace boost{ namespace math{ #include #include #include +#include # define BOOST_MATH_CUDA_ENABLED __host__ __device__ # define BOOST_MATH_HAS_GPU_SUPPORT @@ -733,7 +734,7 @@ BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b; template BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_min(const T& a, const T& b) { return a < b ? a : b; } template -BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { return a > b ? a : b; } +BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return a > b ? a : b; } #define BOOST_MATH_GPU_SAFE_SWAP(a, b) gpu_safe_swap(a, b) #define BOOST_MATH_GPU_SAFE_MIN(a, b) gpu_safe_min(a, b) @@ -794,10 +795,13 @@ BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { retur #define BOOST_MATH_NOEXCEPT(T) noexcept(boost::math::is_floating_point_v) #define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T) #define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T) +#define BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(T) #define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast(V) #define BOOST_MATH_FORCEINLINE __forceinline__ #define BOOST_MATH_STD_USING #define BOOST_MATH_IF_CONSTEXPR if constexpr +#define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point::value) +#define BOOST_MATH_CONSTEXPR_TABLE_FUNCTION constexpr // This should be defined to nothing but since it is not specifically a math macro // we need to undef before proceeding @@ -829,6 +833,9 @@ BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b; # define BOOST_MATH_INLINE_CONSTEXPR constexpr #endif +#define BOOST_MATH_INSTRUMENT_VARIABLE(x) +#define BOOST_MATH_INSTRUMENT_CODE(x) + #endif // NVRTC #endif // BOOST_MATH_TOOLS_CONFIG_HPP diff --git a/include/boost/math/tools/roots.hpp b/include/boost/math/tools/roots.hpp index 8f36aa22d..b0b0fc246 100644 --- a/include/boost/math/tools/roots.hpp +++ b/include/boost/math/tools/roots.hpp @@ -1,4 +1,5 @@ // (C) Copyright John Maddock 2006. +// (C) Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0.
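A usage sketch for the new alias header (illustration only): code written against boost::math::array compiles unchanged on the host, where it is std::array, and under CUDA, where it is cuda::std::array from libcu++ with __device__-marked members, which is what lets the factorial tables earlier in this patch keep a single definition.

    #include <boost/math/tools/array.hpp>
    #include <boost/math/tools/config.hpp>

    template <typename T>
    BOOST_MATH_GPU_ENABLED T table_sum()
    {
       const boost::math::array<T, 3> table = {{ T(1), T(2), T(4) }};
       T sum = 0;
       for (unsigned i = 0; i < table.size(); ++i)  // size()/operator[] exist in both backends
       {
          sum += table[i];
       }
       return sum;
    }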
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -11,22 +12,19 @@ #endif #include - -#ifndef BOOST_MATH_HAS_NVRTC // Disabled for now - #include // test for multiprecision types in complex Newton - -#include -#include -#include -#include - -#include - +#include +#include +#include +#include #include +#include +#include + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT #include #include -#include +#endif namespace boost { namespace math { @@ -37,11 +35,11 @@ namespace detail { namespace dummy { template - typename T::value_type get(const T&) BOOST_MATH_NOEXCEPT(T); + BOOST_MATH_GPU_ENABLED typename T::value_type get(const T&) BOOST_MATH_NOEXCEPT(T); } template -void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T) +BOOST_MATH_GPU_ENABLED void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T) { using dummy::get; // Use ADL to find the right overload for get: @@ -49,7 +47,7 @@ void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T) b = get<1>(t); } template -void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T) +BOOST_MATH_GPU_ENABLED void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T) { using dummy::get; // Use ADL to find the right overload for get: @@ -59,7 +57,7 @@ void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T) } template -inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T) +BOOST_MATH_GPU_ENABLED inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T) { using dummy::get; // Rely on ADL to find the correct overload of get: @@ -67,26 +65,30 @@ inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T) } template -inline void unpack_tuple(const std::pair& p, V& a, V& b) BOOST_MATH_NOEXCEPT(T) +BOOST_MATH_GPU_ENABLED inline void unpack_tuple(const boost::math::pair& p, V& a, V& b) BOOST_MATH_NOEXCEPT(T) { a = p.first; b = p.second; } template -inline void unpack_0(const std::pair& p, V& a) BOOST_MATH_NOEXCEPT(T) +BOOST_MATH_GPU_ENABLED inline void unpack_0(const boost::math::pair& p, V& a) BOOST_MATH_NOEXCEPT(T) { a = p.first; } template -void handle_zero_derivative(F f, +BOOST_MATH_GPU_ENABLED void handle_zero_derivative(F f, T& last_f0, const T& f0, T& delta, T& result, T& guess, const T& min, - const T& max) noexcept(BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval()))) + const T& max) noexcept(BOOST_MATH_IS_FLOAT(T) + #ifndef BOOST_MATH_HAS_GPU_SUPPORT + && noexcept(std::declval()(std::declval())) + #endif + ) { if (last_f0 == 0) { @@ -132,25 +134,29 @@ void handle_zero_derivative(F f, } // namespace template -std::pair bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter, const Policy& pol) noexcept(policies::is_noexcept_error_policy::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval()))) +BOOST_MATH_GPU_ENABLED boost::math::pair bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter, const Policy& pol) noexcept(policies::is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()(std::declval())) +#endif +) { T fmin = f(min); T fmax = f(max); if (fmin == 0) { max_iter = 2; - return std::make_pair(min, min); + return boost::math::make_pair(min, min); } if (fmax == 0) { max_iter = 2; - return std::make_pair(max, max); + return boost::math::make_pair(max, max); } // // Error checking: // - static const char* function = "boost::math::tools::bisect<%1%>"; + constexpr auto function = 
"boost::math::tools::bisect<%1%>"; if (min >= max) { return boost::math::detail::pair_from_single(policies::raise_evaluation_error(function, @@ -200,29 +206,41 @@ std::pair bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter, con std::cout << "Bisection required " << max_iter << " iterations.\n"; #endif - return std::make_pair(min, max); + return boost::math::make_pair(min, max); } template -inline std::pair bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval()))) +BOOST_MATH_GPU_ENABLED inline boost::math::pair bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()(std::declval())) +#endif +) { return bisect(f, min, max, tol, max_iter, policies::policy<>()); } template -inline std::pair bisect(F f, T min, T max, Tol tol) noexcept(policies::is_noexcept_error_policy >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval()))) +BOOST_MATH_GPU_ENABLED inline boost::math::pair bisect(F f, T min, T max, Tol tol) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()(std::declval())) +#endif +) { - std::uintmax_t m = (std::numeric_limits::max)(); + boost::math::uintmax_t m = (boost::math::numeric_limits::max)(); return bisect(f, min, max, tol, m, policies::policy<>()); } template -T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval()))) +BOOST_MATH_GPU_ENABLED T newton_raphson_iterate(F f, T guess, T min, T max, int digits, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()(std::declval())) +#endif +) { BOOST_MATH_STD_USING - static const char* function = "boost::math::tools::newton_raphson_iterate<%1%>"; + constexpr auto function = "boost::math::tools::newton_raphson_iterate<%1%>"; if (min > max) { return policies::raise_evaluation_error(function, "Range arguments in wrong order in boost::math::tools::newton_raphson_iterate(first arg=%1%)", min, boost::math::policies::policy<>()); @@ -249,7 +267,7 @@ T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& T max_range_f = 0; T min_range_f = 0; - std::uintmax_t count(max_iter); + boost::math::uintmax_t count(max_iter); #ifdef BOOST_MATH_INSTRUMENT std::cout << "Newton_raphson_iterate, guess = " << guess << ", min = " << min << ", max = " << max @@ -336,12 +354,22 @@ T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& } template -inline T newton_raphson_iterate(F f, T guess, T min, T max, int digits) noexcept(policies::is_noexcept_error_policy >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval()(std::declval()))) +BOOST_MATH_GPU_ENABLED inline T newton_raphson_iterate(F f, T guess, T min, T max, int digits) noexcept(policies::is_noexcept_error_policy >::value && BOOST_MATH_IS_FLOAT(T) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()(std::declval())) +#endif +) { - std::uintmax_t m = (std::numeric_limits::max)(); + boost::math::uintmax_t m = (boost::math::numeric_limits::max)(); return 
newton_raphson_iterate(f, guess, min, max, digits, m); } +// TODO(mborland): Disabled for now +// Recursion needs to be removed, but there is no demand at this time +#ifdef BOOST_MATH_HAS_NVRTC +}}} // Namespaces +#else + namespace detail { struct halley_step diff --git a/include/boost/math/tools/series.hpp b/include/boost/math/tools/series.hpp index a4b5cc626..50f2828bb 100644 --- a/include/boost/math/tools/series.hpp +++ b/include/boost/math/tools/series.hpp @@ -10,10 +10,10 @@ #pragma once #endif -#include -#include -#include + #include +#include +#include namespace boost{ namespace math{ namespace tools{ @@ -21,13 +21,17 @@ namespace boost{ namespace math{ namespace tools{ // Simple series summation come first: // template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms, const V& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms, const V& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; - std::uintmax_t counter = max_terms; + boost::math::uintmax_t counter = max_terms; result_type result = init_value; result_type next_term; @@ -44,14 +48,22 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { typename Functor::result_type init_value = 0; return sum_series(func, factor, max_terms, init_value); } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, std::uintmax_t& max_terms, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; @@ -60,17 +72,25 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; - std::uintmax_t iters = (std::numeric_limits::max)(); + boost::math::uintmax_t iters = (boost::math::numeric_limits::max)(); 
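// Forward to the fully general overload: start from a zero initial value with an
// effectively unbounded term count, so termination is controlled by the tolerance alone.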
result_type init_val = 0; return sum_series(func, bits, iters, init_val); } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; @@ -79,23 +99,31 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING - std::uintmax_t iters = (std::numeric_limits::max)(); + boost::math::uintmax_t iters = (boost::math::numeric_limits::max)(); return sum_series(func, bits, iters, init_value); } // // Checked summation: // template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms, const V& init_value, V& norm) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms, const V& init_value, V& norm) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; - std::uintmax_t counter = max_terms; + boost::math::uintmax_t counter = max_terms; result_type result = init_value; result_type next_term; @@ -125,7 +153,11 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(F // in any case the result is still much better than a naive summation. 
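// The compensated update at the heart of the method is, in outline:
//
//    y = next_term - carry;         // fold the previous rounding error into the new term
//    t = result + y;                // the high-order bits accumulate in the total
//    carry = (t - result) - y;      // recover the low-order bits the addition dropped
//    result = t;
//
// which is the classic Kahan recurrence that kahan_sum_series applies to each term.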
// template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING @@ -148,13 +180,17 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Fun } template -BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) +#ifndef BOOST_MATH_HAS_GPU_SUPPORT +&& noexcept(std::declval()()) +#endif +) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; - std::uintmax_t counter = max_terms; + boost::math::uintmax_t counter = max_terms; result_type factor = ldexp(result_type(1), bits); result_type result = func(); diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 44e818c58..64fac76fb 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -17,12 +17,14 @@ run test_arcsine_pdf_float.cu ; run test_arcsine_quan_double.cu ; run test_arcsine_quan_float.cu ; run test_arcsine_range_support_double.cu ; + run test_bernoulli_cdf_double.cu ; run test_bernoulli_cdf_float.cu ; run test_bernoulli_pdf_double.cu ; run test_bernoulli_pdf_float.cu ; run test_bernoulli_range_support_double.cu ; run test_bernoulli_range_support_float.cu ; + run test_cauchy_cdf_double.cu ; run test_cauchy_cdf_float.cu ; run test_cauchy_pdf_double.cu ; @@ -31,6 +33,7 @@ run test_cauchy_quan_double.cu ; run test_cauchy_quan_float.cu ; run test_cauchy_range_support_double.cu ; run test_cauchy_range_support_float.cu ; + run test_exponential_cdf_double.cu ; run test_exponential_cdf_float.cu ; run test_exponential_pdf_double.cu ; @@ -39,40 +42,47 @@ run test_exponential_quan_double.cu ; run test_exponential_quan_float.cu ; run test_exponential_range_support_double.cu ; run test_exponential_range_support_float.cu ; + run test_extreme_value_cdf_double.cu ; run test_extreme_value_cdf_float.cu ; run test_extreme_value_pdf_double.cu ; run test_extreme_value_pdf_float.cu ; run test_extreme_value_quan_double.cu ; run test_extreme_value_quan_float.cu ; + run test_holtsmark_cdf_double.cu ; run test_holtsmark_cdf_float.cu ; run test_holtsmark_pdf_double.cu ; run test_holtsmark_pdf_float.cu ; + run test_landau_cdf_double.cu ; run test_landau_cdf_float.cu ; run test_landau_pdf_double.cu ; run test_landau_pdf_float.cu ; run test_landau_quan_double.cu; run test_landau_quan_float.cu ; + run test_laplace_cdf_double.cu ; run test_laplace_cdf_float.cu ; run test_laplace_pdf_double.cu ; run test_laplace_pdf_float.cu ; run test_laplace_quan_double.cu ; run test_laplace_quan_float.cu ; + run test_logistic_cdf_double.cu ; run test_logistic_cdf_float.cu ; run test_logistic_pdf_double.cu ; run test_logistic_pdf_float.cu ; run test_logistic_quan_double.cu ; run test_logistic_quan_float.cu ; + run test_mapairy_cdf_double.cu ; run test_mapairy_cdf_float.cu ; run test_mapairy_pdf_double.cu ; run test_mapairy_pdf_float.cu ; run test_mapairy_quan_double.cu ; run 
test_mapairy_quan_float.cu ; + run test_saspoint5_cdf_double.cu ; run test_saspoint5_cdf_float.cu ; run test_saspoint5_pdf_double.cu ; @@ -81,17 +91,52 @@ run test_saspoint5_quan_double.cu ; run test_saspoint5_quan_float.cu ; # Special Functions -# run test_beta_simple.cpp ; run test_beta_double.cu ; run test_beta_float.cu ; + +run test_bessel_i0_double.cu ; +run test_bessel_i0_float.cu ; +run test_bessel_i1_double.cu ; +run test_bessel_i1_float.cu ; +run test_bessel_j0_double.cu ; +run test_bessel_j0_float.cu ; +run test_bessel_j1_double.cu ; +run test_bessel_j1_float.cu ; +run test_bessel_k0_double.cu ; +run test_bessel_k0_float.cu ; +run test_bessel_k1_double.cu ; +run test_bessel_k1_float.cu ; +run test_bessel_kn_double.cu ; +run test_bessel_kn_float.cu ; +run test_bessel_y0_double.cu ; +run test_bessel_y0_float.cu ; +run test_bessel_y1_double.cu ; +run test_bessel_y1_float.cu ; +run test_cyl_bessel_i_double.cu ; +run test_cyl_bessel_i_float.cu ; +run test_cyl_bessel_j_double.cu ; +run test_cyl_bessel_j_float.cu ; +run test_cyl_bessel_k_double.cu ; +run test_cyl_bessel_k_float.cu ; +run test_sph_bessel_double.cu ; +run test_sph_bessel_float.cu ; +run test_cyl_neumann_double.cu ; +run test_cyl_neumann_float.cu ; +run test_sph_neumann_double.cu ; +run test_sph_neumann_float.cu ; + run test_cbrt_double.cu ; run test_cbrt_float.cu ; + run test_changesign_double.cu ; run test_changesign_float.cu ; + run test_cos_pi_double.cu ; run test_cos_pi_float.cu ; + run test_digamma_double.cu ; run test_digamma_float.cu ; + run test_erf_double.cu ; run test_erf_float.cu ; run test_erf_inv_double.cu ; @@ -100,21 +145,29 @@ run test_erfc_double.cu ; run test_erfc_float.cu ; run test_erfc_inv_double.cu ; run test_erfc_inv_float.cu ; + run test_expm1_double.cu ; run test_expm1_float.cu ; + run test_lgamma_double.cu ; run test_lgamma_float.cu ; -run test_log1p_double.cu ; -run test_log1p_float.cu ; -run test_modf_double.cu ; -run test_modf_float.cu ; -run test_round_double.cu ; -run test_round_float.cu ; -run test_sin_pi_double.cu ; -run test_sin_pi_float.cu ; run test_tgamma_double.cu ; run test_tgamma_float.cu ; + +run test_log1p_double.cu ; +run test_log1p_float.cu ; + +run test_modf_double.cu ; +run test_modf_float.cu ; + +run test_round_double.cu ; +run test_round_float.cu ; + +run test_sin_pi_double.cu ; +run test_sin_pi_float.cu ; + run test_trigamma_double.cu ; run test_trigamma_float.cu ; + run test_trunc_double.cu ; run test_trunc_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 7e57f93ce..de235822e 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -90,12 +90,47 @@ run test_saspoint5_quan_nvrtc_float.cpp ; # Special Functions run test_beta_nvrtc_double.cpp ; run test_beta_nvrtc_float.cpp ; + +run test_bessel_i0_nvrtc_double.cpp ; +run test_bessel_i0_nvrtc_float.cpp ; +run test_bessel_i1_nvrtc_double.cpp ; +run test_bessel_i1_nvrtc_float.cpp ; +run test_bessel_j0_nvrtc_double.cpp ; +run test_bessel_j0_nvrtc_float.cpp ; +run test_bessel_j1_nvrtc_double.cpp ; +run test_bessel_j1_nvrtc_float.cpp ; +run test_bessel_k0_nvrtc_double.cpp ; +run test_bessel_k0_nvrtc_float.cpp ; +run test_bessel_k1_nvrtc_double.cpp ; +run test_bessel_k1_nvrtc_float.cpp ; +run test_bessel_kn_nvrtc_double.cpp ; +run test_bessel_kn_nvrtc_float.cpp ; +run test_bessel_y0_nvrtc_double.cpp ; +run test_bessel_y0_nvrtc_float.cpp ; +run test_bessel_y1_nvrtc_double.cpp ; +run test_bessel_y1_nvrtc_float.cpp ; +run test_cyl_bessel_i_nvrtc_double.cpp ; +run test_cyl_bessel_i_nvrtc_float.cpp ; +run 
test_cyl_bessel_j_nvrtc_double.cpp ; +run test_cyl_bessel_j_nvrtc_float.cpp ; +run test_cyl_bessel_k_nvrtc_double.cpp ; +run test_cyl_bessel_k_nvrtc_float.cpp ; +run test_sph_bessel_nvrtc_double.cpp ; +run test_sph_bessel_nvrtc_float.cpp ; +run test_cyl_neumann_nvrtc_double.cpp ; +run test_cyl_neumann_nvrtc_float.cpp ; +run test_sph_neumann_nvrtc_double.cpp ; +run test_sph_neumann_nvrtc_float.cpp ; + run test_cbrt_nvrtc_double.cpp ; run test_cbrt_nvrtc_float.cpp ; + run test_cos_pi_nvrtc_double.cpp ; run test_cos_pi_nvrtc_float.cpp ; + run test_digamma_nvrtc_double.cpp ; run test_digamma_nvrtc_float.cpp ; + run test_erf_nvrtc_double.cpp ; run test_erf_nvrtc_float.cpp ; run test_erfc_nvrtc_double.cpp ; @@ -104,22 +139,32 @@ run test_erf_inv_nvrtc_double.cpp ; run test_erf_inv_nvrtc_float.cpp ; run test_erfc_inv_nvrtc_double.cpp ; run test_erfc_inv_nvrtc_float.cpp ; + run test_expm1_nvrtc_double.cpp ; run test_expm1_nvrtc_float.cpp ; + run test_fpclassify_nvrtc_double.cpp ; run test_fpclassify_nvrtc_float.cpp ; + run test_gamma_nvrtc_double.cpp ; run test_gamma_nvrtc_float.cpp ; + run test_log1p_nvrtc_double.cpp ; run test_log1p_nvrtc_float.cpp ; + run test_modf_nvrtc_double.cpp ; run test_modf_nvrtc_float.cpp ; + run test_round_nvrtc_double.cpp ; run test_round_nvrtc_float.cpp ; + run test_sign_nvrtc_double.cpp ; run test_sign_nvrtc_float.cpp ; + run test_sin_pi_nvrtc_double.cpp ; run test_sin_pi_nvrtc_float.cpp ; + run test_trigamma_nvrtc_double.cpp ; run test_trigamma_nvrtc_float.cpp ; + run test_trunc_nvrtc_double.cpp ; diff --git a/test/sycl_jamfile b/test/sycl_jamfile index d0a458cce..97c48474c 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -25,6 +25,10 @@ run test_saspoint5.cpp ; # Special Functions run pow_test.cpp ; run test_beta_simple.cpp ; +run test_bessel_i.cpp ; +run test_bessel_j.cpp ; +run test_bessel_k.cpp ; +run test_bessel_y.cpp ; run test_cbrt.cpp ; run test_sign.cpp ; run test_round.cpp ; diff --git a/test/test_bessel_i.cpp b/test/test_bessel_i.cpp index 68dcab0a5..70aac91e4 100644 --- a/test/test_bessel_i.cpp +++ b/test/test_bessel_i.cpp @@ -3,7 +3,13 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#else +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +#include +#endif + #include "test_bessel_i.hpp" // @@ -82,7 +88,11 @@ void expected_results() "linux", // platform largest_type, // test type(s) ".*Random.*", // test data group + #ifdef SYCL_LANGUAGE_VERSION + ".*", 600, 200); + #else ".*", 400, 200); // test function + #endif add_expected_result( "GNU.*", // compiler @@ -111,7 +121,11 @@ void expected_results() ".*", // platform largest_type, // test type(s) ".*", // test data group + #ifdef SYCL_LANGUAGE_VERSION + ".*", 400, 200); + #else ".*", 20, 10); // test function + #endif // // Set error rates a little higher for real_concept - // now that we use a series approximation for small z diff --git a/test/test_bessel_i.hpp b/test/test_bessel_i.hpp index 2da559f32..aa4f6a4ea 100644 --- a/test/test_bessel_i.hpp +++ b/test/test_bessel_i.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include "functor.hpp" @@ -180,7 +181,10 @@ void test_bessel(T, const char* name) // // Special cases for full coverage: // + #ifndef BOOST_MATH_NO_EXCEPTIONS BOOST_CHECK_THROW(boost::math::cyl_bessel_i(T(-2.5), T(-2.5)), std::domain_error); + #endif + BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(0), T(0)), T(1)); BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(10), T(0)), T(0)); BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(-10), T(0)), T(0)); @@ -197,10 +201,12 @@ void test_bessel(T, const char* name) } } T tolerance = boost::math::tools::epsilon() * 100; +#ifndef SYCL_LANGUAGE_VERSION if ((boost::math::tools::digits() <= std::numeric_limits::digits) && (std::numeric_limits::max_exponent > 1000)) { BOOST_CHECK_CLOSE_FRACTION(boost::math::cyl_bessel_i(T(0.5), T(710)), SC_(3.3447452278080108123142599104927325061327359278058601201179e306), tolerance); } +#endif #if LDBL_MAX_EXP >= 11356 BOOST_IF_CONSTEXPR (std::numeric_limits::max_exponent >= 11356) { diff --git a/test/test_bessel_i0_double.cu b/test/test_bessel_i0_double.cu new file mode 100644 index 000000000..1c5d0ca14 --- /dev/null +++ b/test/test_bessel_i0_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

typedef double float_type;

/**
 * CUDA Kernel Device code
 */
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        out[i] = boost::math::detail::bessel_i0(in[i]);
    }
}

/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

    // Allocate the managed input vector
    cuda_managed_ptr<float_type> input_vector(numElements);

    // Allocate the managed output vector
    cuda_managed_ptr<float_type> output_vector(numElements);

    // Initialize the input vector
    for (int i = 0; i < numElements; ++i)
    {
        input_vector[i] = rand()/(float_type)RAND_MAX;
    }

    // Launch the CUDA kernel
    int threadsPerBlock = 1024;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

    watch w;

    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
    cudaDeviceSynchronize();

    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;

    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
        return EXIT_FAILURE;
    }

    // Verify that the result vector is correct by comparing against the host computation
    std::vector<float_type> results;
    results.reserve(numElements);
    w.reset();
    for(int i = 0; i < numElements; ++i)
        results.push_back(boost::math::detail::bessel_i0(input_vector[i]));
    double t = w.elapsed();
    // check the results
    for(int i = 0; i < numElements; ++i)
    {
        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
        {
            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
            return EXIT_FAILURE;
        }
    }

    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
    std::cout << "Done\n";

    return 0;
}
diff --git a/test/test_bessel_i0_float.cu b/test/test_bessel_i0_float.cu
new file mode 100644
index 000000000..39929d548
--- /dev/null
+++ b/test/test_bessel_i0_float.cu
@@ -0,0 +1,100 @@

// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0.
(See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

typedef float float_type;

/**
 * CUDA Kernel Device code
 */
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        out[i] = boost::math::detail::bessel_i0(in[i]);
    }
}

/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

    // Allocate the managed input vector
    cuda_managed_ptr<float_type> input_vector(numElements);

    // Allocate the managed output vector
    cuda_managed_ptr<float_type> output_vector(numElements);

    // Initialize the input vector
    for (int i = 0; i < numElements; ++i)
    {
        input_vector[i] = rand()/(float_type)RAND_MAX;
    }

    // Launch the CUDA kernel
    int threadsPerBlock = 1024;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

    watch w;

    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
    cudaDeviceSynchronize();

    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;

    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
        return EXIT_FAILURE;
    }

    // Verify that the result vector is correct by comparing against the host computation
    std::vector<float_type> results;
    results.reserve(numElements);
    w.reset();
    for(int i = 0; i < numElements; ++i)
        results.push_back(boost::math::detail::bessel_i0(input_vector[i]));
    double t = w.elapsed();
    // check the results
    for(int i = 0; i < numElements; ++i)
    {
        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
        {
            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
            return EXIT_FAILURE;
        }
    }

    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
    std::cout << "Done\n";

    return 0;
}
diff --git a/test/test_bessel_i0_nvrtc_double.cpp b/test/test_bessel_i0_nvrtc_double.cpp
new file mode 100644
index 000000000..0c5db47b4
--- /dev/null
+++ b/test/test_bessel_i0_nvrtc_double.cpp
@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_bessel_i0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_i0(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_i0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_i0(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_i0_nvrtc_float.cpp b/test/test_bessel_i0_nvrtc_float.cpp new file mode 100644 index 000000000..26d667b97 --- /dev/null +++ b/test/test_bessel_i0_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_i0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_i0(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_i0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_i0(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_i1_double.cu b/test/test_bessel_i1_double.cu new file mode 100644 index 000000000..e4d6443a6 --- /dev/null +++ b/test/test_bessel_i1_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

typedef double float_type;

/**
 * CUDA Kernel Device code
 */
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        out[i] = boost::math::detail::bessel_i1(in[i]);
    }
}

/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

    // Allocate the managed input vector
    cuda_managed_ptr<float_type> input_vector(numElements);

    // Allocate the managed output vector
    cuda_managed_ptr<float_type> output_vector(numElements);

    // Initialize the input vector
    for (int i = 0; i < numElements; ++i)
    {
        input_vector[i] = rand()/(float_type)RAND_MAX;
    }

    // Launch the CUDA kernel
    int threadsPerBlock = 1024;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

    watch w;

    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
    cudaDeviceSynchronize();

    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;

    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
        return EXIT_FAILURE;
    }

    // Verify that the result vector is correct by comparing against the host computation
    std::vector<float_type> results;
    results.reserve(numElements);
    w.reset();
    for(int i = 0; i < numElements; ++i)
        results.push_back(boost::math::detail::bessel_i1(input_vector[i]));
    double t = w.elapsed();
    // check the results
    for(int i = 0; i < numElements; ++i)
    {
        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
        {
            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
            return EXIT_FAILURE;
        }
    }

    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
    std::cout << "Done\n";

    return 0;
}
diff --git a/test/test_bessel_i1_float.cu b/test/test_bessel_i1_float.cu
new file mode 100644
index 000000000..12ae53542
--- /dev/null
+++ b/test/test_bessel_i1_float.cu
@@ -0,0 +1,100 @@

// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0.
(See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

typedef float float_type;

/**
 * CUDA Kernel Device code
 */
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        out[i] = boost::math::detail::bessel_i1(in[i]);
    }
}

/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

    // Allocate the managed input vector
    cuda_managed_ptr<float_type> input_vector(numElements);

    // Allocate the managed output vector
    cuda_managed_ptr<float_type> output_vector(numElements);

    // Initialize the input vector
    for (int i = 0; i < numElements; ++i)
    {
        input_vector[i] = rand()/(float_type)RAND_MAX;
    }

    // Launch the CUDA kernel
    int threadsPerBlock = 1024;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

    watch w;

    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
    cudaDeviceSynchronize();

    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;

    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
        return EXIT_FAILURE;
    }

    // Verify that the result vector is correct by comparing against the host computation
    std::vector<float_type> results;
    results.reserve(numElements);
    w.reset();
    for(int i = 0; i < numElements; ++i)
        results.push_back(boost::math::detail::bessel_i1(input_vector[i]));
    double t = w.elapsed();
    // check the results
    for(int i = 0; i < numElements; ++i)
    {
        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
        {
            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
            return EXIT_FAILURE;
        }
    }

    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
    std::cout << "Done\n";

    return 0;
}
diff --git a/test/test_bessel_i1_nvrtc_double.cpp b/test/test_bessel_i1_nvrtc_double.cpp
new file mode 100644
index 000000000..c270a6694
--- /dev/null
+++ b/test/test_bessel_i1_nvrtc_double.cpp
@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_bessel_i1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_i1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_i1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_i1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_i1_nvrtc_float.cpp b/test/test_bessel_i1_nvrtc_float.cpp new file mode 100644 index 000000000..158c6a815 --- /dev/null +++ b/test/test_bessel_i1_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_i1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_i1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_i1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_i1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_j.cpp b/test/test_bessel_j.cpp index 19a5f7426..516e34c29 100644 --- a/test/test_bessel_j.cpp +++ b/test/test_bessel_j.cpp @@ -3,7 +3,12 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#else +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +#include +#endif #include "test_bessel_j.hpp" diff --git a/test/test_bessel_j.hpp b/test/test_bessel_j.hpp index 82106213e..c0b719ad8 100644 --- a/test/test_bessel_j.hpp +++ b/test/test_bessel_j.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -279,7 +280,9 @@ void test_bessel(T, const char* name) BOOST_MATH_CHECK_THROW(boost::math::sph_bessel(2, T(-2.0)), std::domain_error); BOOST_CHECK_EQUAL(boost::math::cyl_bessel_j(T(0), T(2.5)), boost::math::cyl_bessel_j(T(0), T(-2.5))); BOOST_CHECK_EQUAL(boost::math::cyl_bessel_j(T(1), T(2.5)), -boost::math::cyl_bessel_j(T(1), T(-2.5))); + #ifndef SYCL_LANGUAGE_VERSION BOOST_CHECK_CLOSE_FRACTION(boost::math::cyl_bessel_j(364, T(38.5)), SC_(1.793940496519190500748409872348034004417458734118663909894e-309), tolerance); + #endif // // Special cases at infinity: // diff --git a/test/test_bessel_j0_double.cu b/test/test_bessel_j0_double.cu new file mode 100644 index 000000000..d32474d96 --- /dev/null +++ b/test/test_bessel_j0_double.cu @@ -0,0 +1,100 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

typedef double float_type;

/**
 * CUDA Kernel Device code
 */
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        out[i] = boost::math::detail::bessel_j0(in[i]);
    }
}

/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

    // Allocate the managed input vector
    cuda_managed_ptr<float_type> input_vector(numElements);

    // Allocate the managed output vector
    cuda_managed_ptr<float_type> output_vector(numElements);

    // Initialize the input vector
    for (int i = 0; i < numElements; ++i)
    {
        input_vector[i] = rand()/(float_type)RAND_MAX;
    }

    // Launch the CUDA kernel
    int threadsPerBlock = 1024;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

    watch w;

    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
    cudaDeviceSynchronize();

    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;

    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
        return EXIT_FAILURE;
    }

    // Verify that the result vector is correct by comparing against the host computation
    std::vector<float_type> results;
    results.reserve(numElements);
    w.reset();
    for(int i = 0; i < numElements; ++i)
        results.push_back(boost::math::detail::bessel_j0(input_vector[i]));
    double t = w.elapsed();
    // check the results
    for(int i = 0; i < numElements; ++i)
    {
        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
        {
            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
            return EXIT_FAILURE;
        }
    }

    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
    std::cout << "Done\n";

    return 0;
}
diff --git a/test/test_bessel_j0_float.cu b/test/test_bessel_j0_float.cu
new file mode 100644
index 000000000..48c6b9e39
--- /dev/null
+++ b/test/test_bessel_j0_float.cu
@@ -0,0 +1,100 @@

// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0.
(See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

typedef float float_type;

/**
 * CUDA Kernel Device code
 */
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        out[i] = boost::math::detail::bessel_j0(in[i]);
    }
}

/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

    // Allocate the managed input vector
    cuda_managed_ptr<float_type> input_vector(numElements);

    // Allocate the managed output vector
    cuda_managed_ptr<float_type> output_vector(numElements);

    // Initialize the input vector
    for (int i = 0; i < numElements; ++i)
    {
        input_vector[i] = rand()/(float_type)RAND_MAX;
    }

    // Launch the CUDA kernel
    int threadsPerBlock = 1024;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

    watch w;

    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
    cudaDeviceSynchronize();

    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;

    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
        return EXIT_FAILURE;
    }

    // Verify that the result vector is correct by comparing against the host computation
    std::vector<float_type> results;
    results.reserve(numElements);
    w.reset();
    for(int i = 0; i < numElements; ++i)
        results.push_back(boost::math::detail::bessel_j0(input_vector[i]));
    double t = w.elapsed();
    // check the results
    for(int i = 0; i < numElements; ++i)
    {
        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
        {
            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
            return EXIT_FAILURE;
        }
    }

    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
    std::cout << "Done\n";

    return 0;
}
diff --git a/test/test_bessel_j0_nvrtc_double.cpp b/test/test_bessel_j0_nvrtc_double.cpp
new file mode 100644
index 000000000..8c8b79841
--- /dev/null
+++ b/test/test_bessel_j0_nvrtc_double.cpp
@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0.
diff --git a/test/test_bessel_j0_nvrtc_double.cpp b/test/test_bessel_j0_nvrtc_double.cpp new file mode 100644 index 000000000..8c8b79841 --- /dev/null +++ b/test/test_bessel_j0_nvrtc_double.cpp @@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_bessel_j0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_j0(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j0_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_bessel_j0_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j0_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::detail::bessel_j0(h_in1[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
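Each NVRTC test repeats the same pipeline: nvrtcCreateProgram and nvrtcCompileProgram build PTX from the embedded kernel source, cuModuleLoadDataEx and cuModuleGetFunction load it through the driver API, and cuLaunchKernel runs it before the host recomputes every value for comparison. Roughly the same compile step can be reproduced from the command line (illustrative only; the include path is a placeholder, not part of the patch):

    nvcc --ptx --std=c++14 -I/path/to/boost-root/libs/cuda-math/include test_bessel_j0_kernel.cu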
diff --git a/test/test_bessel_j0_nvrtc_float.cpp b/test/test_bessel_j0_nvrtc_float.cpp new file mode 100644 index 000000000..4a54b1eaa --- /dev/null +++ b/test/test_bessel_j0_nvrtc_float.cpp @@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_bessel_j0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_j0(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j0_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_bessel_j0_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j0_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::detail::bessel_j0(h_in1[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_bessel_j1_double.cu b/test/test_bessel_j1_double.cu new file mode 100644 index 000000000..33a6e71b6 --- /dev/null +++ b/test/test_bessel_j1_double.cu @@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_j1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::detail::bessel_j1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_j1_float.cu b/test/test_bessel_j1_float.cu new file mode 100644 index 000000000..14dd37be3 --- /dev/null +++ b/test/test_bessel_j1_float.cu @@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_j1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::detail::bessel_j1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_j1_nvrtc_double.cpp b/test/test_bessel_j1_nvrtc_double.cpp new file mode 100644 index 000000000..11460c11d --- /dev/null +++ b/test/test_bessel_j1_nvrtc_double.cpp @@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_bessel_j1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_j1(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_bessel_j1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::detail::bessel_j1(h_in1[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_bessel_j1_nvrtc_float.cpp b/test/test_bessel_j1_nvrtc_float.cpp new file mode 100644 index 000000000..8f7cc6e3f --- /dev/null +++ b/test/test_bessel_j1_nvrtc_float.cpp @@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_bessel_j1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_j1(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_bessel_j1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::detail::bessel_j1(h_in1[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_bessel_k.cpp b/test/test_bessel_k.cpp index f0975b46d..d4ab7721f 100644 --- a/test/test_bessel_k.cpp +++ b/test/test_bessel_k.cpp @@ -5,7 +5,12 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#else
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+#include <boost/math/special_functions/bessel.hpp>
+#endif
 
 #ifdef _MSC_VER
 #  pragma warning(disable : 4756) // overflow in constant arithmetic
diff --git a/test/test_bessel_k.hpp b/test/test_bessel_k.hpp index 22df3218f..6a2a8179d 100644 --- a/test/test_bessel_k.hpp +++ b/test/test_bessel_k.hpp @@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "functor.hpp"
@@ -175,6 +176,7 @@ void test_bessel(T, const char* name)
    //
    // Extra test coverage:
    //
+   #ifndef SYCL_LANGUAGE_VERSION // SYCL doesn't throw
    BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(2), T(-1)), std::domain_error);
    BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(2.2), T(-1)), std::domain_error);
    BOOST_IF_CONSTEXPR(std::numeric_limits<T>::has_infinity)
@@ -194,6 +196,7 @@
    BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(-1.25), T(0)), std::domain_error);
    BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(-1), T(0)), std::domain_error);
    BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(1), T(0)), std::domain_error);
+   #endif
 }
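The guards added to test_bessel_k.cpp and test_bessel_k.hpp reflect a SYCL limitation rather than a behaviour change: SYCL device code cannot propagate C++ exceptions, so the BOOST_CHECK_THROW domain-error cases are compiled out under SYCL_LANGUAGE_VERSION while the remaining accuracy checks run unchanged.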
diff --git a/test/test_bessel_k0_double.cu b/test/test_bessel_k0_double.cu new file mode 100644 index 000000000..26d0e2bff --- /dev/null +++ b/test/test_bessel_k0_double.cu @@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k0(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::detail::bessel_k0(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_k0_float.cu b/test/test_bessel_k0_float.cu new file mode 100644 index 000000000..ffe59c25b --- /dev/null +++ b/test/test_bessel_k0_float.cu @@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k0(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::detail::bessel_k0(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_k0_nvrtc_double.cpp b/test/test_bessel_k0_nvrtc_double.cpp new file mode 100644 index 000000000..d41221212 --- /dev/null +++ b/test/test_bessel_k0_nvrtc_double.cpp @@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k0(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_bessel_k0_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::detail::bessel_k0(h_in1[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_bessel_k0_nvrtc_float.cpp b/test/test_bessel_k0_nvrtc_float.cpp new file mode 100644 index 000000000..389fce21a --- /dev/null +++ b/test/test_bessel_k0_nvrtc_float.cpp @@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k0(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_bessel_k0_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::detail::bessel_k0(h_in1[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_bessel_k1_double.cu b/test/test_bessel_k1_double.cu new file mode 100644 index 000000000..ed1b353d9 --- /dev/null +++ b/test/test_bessel_k1_double.cu @@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::detail::bessel_k1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_k1_float.cu b/test/test_bessel_k1_float.cu new file mode 100644 index 000000000..65fd802f2 --- /dev/null +++ b/test/test_bessel_k1_float.cu @@ -0,0 +1,100 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k1(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the test CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::detail::bessel_k1(input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_k1_nvrtc_double.cpp b/test/test_bessel_k1_nvrtc_double.cpp new file mode 100644 index 000000000..1e0f1e7f4 --- /dev/null +++ b/test/test_bessel_k1_nvrtc_double.cpp @@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_bessel_k1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_k1(in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_bessel_k1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_k1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_k1_nvrtc_float.cpp b/test/test_bessel_k1_nvrtc_float.cpp new file mode 100644 index 000000000..1422a5886 --- /dev/null +++ b/test/test_bessel_k1_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
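A note on resource handling in these NVRTC harnesses: every check helper calls exit() directly, so an early failure leaks the host buffers and the CUDA context. A minimal RAII sketch (the name scoped_cuda_context is illustrative, not part of the patch):

#include <cuda.h>

// Illustrative helper: releases the context on every exit path.
struct scoped_cuda_context
{
    CUcontext ctx {};
    explicit scoped_cuda_context(CUdevice dev) { cuCtxCreate(&ctx, 0, dev); }
    ~scoped_cuda_context() { if (ctx) { cuCtxDestroy(ctx); } }
    scoped_cuda_context(const scoped_cuda_context&) = delete;
    scoped_cuda_context& operator=(const scoped_cuda_context&) = delete;
};

// Likewise, std::vector<float_type> h_in1(numElements); would replace the
// raw new[]/delete[] pairs used for the host arrays.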
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_k1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_k1(in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k1_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k1_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k1_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_k1(h_in1[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_kn_double.cu b/test/test_bessel_kn_double.cu new file mode 100644 index 000000000..d15ba7304 --- /dev/null +++ b/test/test_bessel_kn_double.cu @@ -0,0 +1,105 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
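The .cu harnesses that follow index one element per thread with i = blockDim.x * blockIdx.x + threadIdx.x. A common generalization, sketched here against the same bessel_kn(2, x, pol) call (not part of the patch), is a grid-stride loop, which stays correct even when numElements exceeds the launched grid:

// Sketch only: grid-stride variant of the test kernels.
__global__ void cuda_test_strided(const float_type *in, float_type *out, int numElements)
{
    boost::math::policies::policy<> pol;
    for (int i = blockDim.x * blockIdx.x + threadIdx.x;
         i < numElements;
         i += blockDim.x * gridDim.x)
    {
        out[i] = boost::math::detail::bessel_kn(2, in[i], pol);
    }
}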
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    boost::math::policies::policy<> pol;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_kn(2, in[i], pol);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the bessel_kn CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    boost::math::policies::policy<> pol;
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::detail::bessel_kn(2, input_vector[i], pol));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_kn_float.cu b/test/test_bessel_kn_float.cu
new file mode 100644
index 000000000..d15ba7304
--- /dev/null
+++ b/test/test_bessel_kn_float.cu
@@ -0,0 +1,105 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    boost::math::policies::policy<> pol;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_kn(2, in[i], pol);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the bessel_kn CUDA kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    boost::math::policies::policy<> pol;
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::math::detail::bessel_kn(2, input_vector[i], pol));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_bessel_kn_nvrtc_double.cpp b/test/test_bessel_kn_nvrtc_double.cpp
new file mode 100644
index 000000000..3b581f77c
--- /dev/null
+++ b/test/test_bessel_kn_nvrtc_double.cpp
@@ -0,0 +1,192 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
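On the tolerances used above and below: the managed-memory .cu tests allow 10 units of disagreement, the NVRTC tests 300. Both bound boost::math::epsilon_difference, which reports |a - b| in multiples of machine epsilon. A self-contained host-side sketch of what that measures:

#include <boost/math/special_functions/relative_difference.hpp>
#include <iostream>
#include <limits>

int main()
{
    double serial   = 1.0;
    double parallel = serial + 4 * std::numeric_limits<double>::epsilon();
    // Prints roughly 4: the two values are four machine epsilons apart.
    std::cout << boost::math::epsilon_difference(serial, parallel) << '\n';
}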
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_bessel_kn_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_kn(2, in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_kn_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_kn_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_kn_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + 
h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_kn(2, h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_kn_nvrtc_float.cpp b/test/test_bessel_kn_nvrtc_float.cpp new file mode 100644 index 000000000..dcc987a70 --- /dev/null +++ b/test/test_bessel_kn_nvrtc_float.cpp @@ -0,0 +1,192 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
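The option lists above hard-code two machine-specific include paths behind BOOST_MATH_NVRTC_CI_RUN. A sketch of reading the path at run time instead; BOOST_MATH_INCLUDE is a hypothetical environment variable, not something the patch defines:

#include <cstdlib>
#include <string>
#include <vector>

// Sketch: assemble the NVRTC option list without a hard-coded home directory.
std::vector<std::string> make_opts()
{
    std::vector<std::string> opts {"--std=c++14", "-I/usr/local/cuda/include"};
    if (const char* p = std::getenv("BOOST_MATH_INCLUDE"))
    {
        opts.push_back(std::string("--include-path=") + p);
    }
    return opts;
}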
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_bessel_kn_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_kn(2, in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_kn_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_kn_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_kn_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + 
h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_kn(2, h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_y.cpp b/test/test_bessel_y.cpp index 83c24b95f..0bbefba55 100644 --- a/test/test_bessel_y.cpp +++ b/test/test_bessel_y.cpp @@ -3,7 +3,12 @@ // Boost Software License, Version 1.0. 
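The test_bessel_y.hpp hunk below wraps BOOST_CHECK_THROW in #ifndef BOOST_MATH_NO_EXCEPTIONS because device and SYCL builds can disable exceptions. On such builds, errors can instead be routed through a policy; a sketch using the library's errno_on_error policy (the helper name is illustrative):

#include <boost/math/special_functions/bessel.hpp>
#include <cerrno>

// Sketch: report a cyl_neumann domain error via errno rather than a throw.
using errno_policy = boost::math::policies::policy<
    boost::math::policies::domain_error<boost::math::policies::errno_on_error>>;

double checked_neumann(double v, double x)
{
    errno = 0;
    double r = boost::math::cyl_neumann(v, x, errno_policy());
    return r; // caller inspects errno == EDOM instead of catching
}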
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#else +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +#include +#endif #include "test_bessel_y.hpp" @@ -234,7 +239,11 @@ void expected_results() ".*", // platform largest_type, // test type(s) ".*(Y[nv]|y).*Random.*", // test data group + #ifdef SYCL_LANGUAGE_VERSION + ".*", 2000, 1000); + #else ".*", 1500, 1000); // test function + #endif // // Fallback for sun has to go after the general cases above: // diff --git a/test/test_bessel_y.hpp b/test/test_bessel_y.hpp index 28361a227..14b0be456 100644 --- a/test/test_bessel_y.hpp +++ b/test/test_bessel_y.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -241,10 +242,12 @@ void test_bessel(T, const char* name) BOOST_CHECK_EQUAL(boost::math::sph_neumann(2, std::numeric_limits::infinity()), T(0)); } + #ifndef BOOST_MATH_NO_EXCEPTIONS BOOST_CHECK_THROW(boost::math::cyl_neumann(T(0), T(-1)), std::domain_error); BOOST_CHECK_THROW(boost::math::cyl_neumann(T(0.2), T(-1)), std::domain_error); BOOST_CHECK_THROW(boost::math::cyl_neumann(T(2), T(0)), std::domain_error); BOOST_CHECK_THROW(boost::math::sph_neumann(2, T(-2)), std::domain_error); + #endif #if LDBL_MAX_EXP > 1024 if (std::numeric_limits::max_exponent > 1024) { diff --git a/test/test_bessel_y0_double.cu b/test/test_bessel_y0_double.cu new file mode 100644 index 000000000..c8deada7d --- /dev/null +++ b/test/test_bessel_y0_double.cu @@ -0,0 +1,106 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y0(in[i], pol); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + + boost::math::policies::policy<> pol; + for(int i = 0; i < numElements; ++i) + { + results.push_back(boost::math::detail::bessel_y0(input_vector[i], pol)); + } + + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_bessel_y0_float.cu b/test/test_bessel_y0_float.cu new file mode 100644 index 000000000..c8deada7d --- /dev/null +++ b/test/test_bessel_y0_float.cu @@ -0,0 +1,106 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y0(in[i], pol); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + + boost::math::policies::policy<> pol; + for(int i = 0; i < numElements; ++i) + { + results.push_back(boost::math::detail::bessel_y0(input_vector[i], pol)); + } + + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_bessel_y0_nvrtc_double.cpp b/test/test_bessel_y0_nvrtc_double.cpp new file mode 100644 index 000000000..8645a0fdd --- /dev/null +++ b/test/test_bessel_y0_nvrtc_double.cpp @@ -0,0 +1,194 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +#include +extern "C" __global__ +void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y0(in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, 
ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_y0(h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_y0_nvrtc_float.cpp b/test/test_bessel_y0_nvrtc_float.cpp new file mode 100644 index 000000000..75a065bd6 --- /dev/null +++ b/test/test_bessel_y0_nvrtc_float.cpp @@ -0,0 +1,194 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
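Note that the y0/y1 NVRTC harnesses reuse the test_bessel_k0_kernel label; because the same literal is passed to nvrtcCreateProgram, the kernel definition, and cuModuleGetFunction, the tests still resolve. A sketch of querying the lowered name registered with nvrtcAddNameExpression after compilation, which avoids hand-matching labels at all:

#include <nvrtc.h>

// Sketch: valid after nvrtcCompileProgram and until nvrtcDestroyProgram.
// With extern "C" kernels, as in these tests, the lowered name equals
// the literal name.
const char* lowered_kernel_name(nvrtcProgram prog)
{
    const char* name = nullptr;
    nvrtcGetLoweredName(prog, "test_bessel_k0_kernel", &name);
    return name;
}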
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +#include +extern "C" __global__ +void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y0(in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_y0(h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_y1_double.cu b/test/test_bessel_y1_double.cu new file mode 100644 index 000000000..a5b3051b4 --- /dev/null +++ b/test/test_bessel_y1_double.cu @@ -0,0 +1,106 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
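The y1 kernels below call boost::math::detail::bessel_y1 directly. For reference, the public entry point that reaches the same code path is cyl_neumann with order 1; a host-side sketch:

#include <boost/math/special_functions/bessel.hpp>

// Sketch: the public API value matching what the test's detail call computes.
double y1_via_public_api(double x)
{
    return boost::math::cyl_neumann(1.0, x); // routes to detail::bessel_y1
}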
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y1(in[i], pol); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + + boost::math::policies::policy<> pol; + for(int i = 0; i < numElements; ++i) + { + results.push_back(boost::math::detail::bessel_y1(input_vector[i], pol)); + } + + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_bessel_y1_float.cu b/test/test_bessel_y1_float.cu new file mode 100644 index 000000000..532aaf328 --- /dev/null +++ b/test/test_bessel_y1_float.cu @@ -0,0 +1,106 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
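These .cu harnesses time the launch with a host stopwatch around cudaDeviceSynchronize. An alternative sketch using CUDA events, which time the device work directly (launch_kernel stands in for the real launch):

#include <cuda_runtime.h>

// Sketch: returns elapsed device time in milliseconds.
template <typename F>
float time_launch_ms(F launch_kernel)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    launch_kernel();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}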
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y1(in[i], pol); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + + boost::math::policies::policy<> pol; + for(int i = 0; i < numElements; ++i) + { + results.push_back(boost::math::detail::bessel_y1(input_vector[i], pol)); + } + + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_bessel_y1_nvrtc_double.cpp b/test/test_bessel_y1_nvrtc_double.cpp new file mode 100644 index 000000000..383d879eb --- /dev/null +++ b/test/test_bessel_y1_nvrtc_double.cpp @@ -0,0 +1,194 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
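Each NVRTC harness repeats the three near-identical helpers checkCUDAError, checkCUError, and checkNVRTCError. A sketch consolidating them into one overload set (names illustrative):

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>
#include <cstdlib>
#include <iostream>

const char* to_message(cudaError_t r) { return cudaGetErrorString(r); }
const char* to_message(nvrtcResult r) { return nvrtcGetErrorString(r); }
const char* to_message(CUresult r)
{
    const char* s = nullptr;
    cuGetErrorString(r, &s);
    return s;
}

// Sketch: one checker for all three result types.
template <typename Result, Result Success>
void check(Result r, const char* msg)
{
    if (r != Success)
    {
        std::cerr << msg << ": " << to_message(r) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}
// usage: check<CUresult, CUDA_SUCCESS>(cuInit(0), "Failed to initialize CUDA");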
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +#include +extern "C" __global__ +void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + boost::math::policies::policy<> pol; + if (i < numElements) + { + out[i] = boost::math::detail::bessel_y1(in1[i], pol); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_bessel_k0_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + boost::math::policies::policy<> pol; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::detail::bessel_y1(h_in1[i], pol); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_bessel_y1_nvrtc_float.cpp b/test/test_bessel_y1_nvrtc_float.cpp new file mode 100644 index 000000000..c2c1355e6 --- /dev/null +++ b/test/test_bessel_y1_nvrtc_float.cpp @@ -0,0 +1,194 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
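One weakness of the NVRTC verification loops above: mismatches are printed but main still returns 0, so CI cannot see a failure in the exit code. A sketch of a verifier that counts failures and fails the process, keeping the tests' 300-epsilon tolerance:

#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <cmath>
#include <cstdlib>

// Sketch only; the real harness would call this from main and return its result.
int verify_y1(const double* in, const double* gpu_out, int n)
{
    boost::math::policies::policy<> pol;
    int failures = 0;
    for (int i = 0; i < n; ++i)
    {
        const double ref = boost::math::detail::bessel_y1(in[i], pol);
        if (std::isfinite(ref) && boost::math::epsilon_difference(ref, gpu_out[i]) > 300)
        {
            ++failures;
        }
    }
    return failures == 0 ? 0 : EXIT_FAILURE;
}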
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/detail/bessel_y1.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/policies/policy.hpp>
+#include <boost/math/special_functions/detail/bessel_y1.hpp>
+extern "C" __global__
+void test_bessel_y1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    boost::math::policies::policy<> pol;
+    if (i < numElements)
+    {
+        out[i] = boost::math::detail::bessel_y1(in1[i], pol);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_y1_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_bessel_y1_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_y1_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        boost::math::policies::policy<> pol;
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::detail::bessel_y1(h_in1[i], pol);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_bessel_i_double.cu b/test/test_cyl_bessel_i_double.cu
new file mode 100644
index 000000000..91a3ed8eb
--- /dev/null
+++ b/test/test_cyl_bessel_i_double.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_bessel_i(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_bessel_i_float.cu b/test/test_cyl_bessel_i_float.cu
new file mode 100644
index 000000000..5aad1be88
--- /dev/null
+++ b/test/test_cyl_bessel_i_float.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_bessel_i(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_bessel_i_nvrtc_double.cpp b/test/test_cyl_bessel_i_nvrtc_double.cpp
new file mode 100644
index 000000000..50bfc0c79
--- /dev/null
+++ b/test/test_cyl_bessel_i_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/bessel.hpp>
+extern "C" __global__
+void test_cyl_bessel_i_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_i_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_bessel_i_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_i_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
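+
+        // The inputs span [0, 1000): cyl_bessel_i overflows for large arguments,
+        // which is why BOOST_MATH_OVERFLOW_ERROR_POLICY is set to ignore_error above.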
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_bessel_i(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_bessel_i_nvrtc_float.cpp b/test/test_cyl_bessel_i_nvrtc_float.cpp
new file mode 100644
index 000000000..c73992a27
--- /dev/null
+++ b/test/test_cyl_bessel_i_nvrtc_float.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/bessel.hpp>
+extern "C" __global__
+void test_cyl_bessel_i_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_i_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_bessel_i_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_i_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_bessel_i(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_bessel_j_double.cu b/test/test_cyl_bessel_j_double.cu
new file mode 100644
index 000000000..b5d93f1dd
--- /dev/null
+++ b/test/test_cyl_bessel_j_double.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_bessel_j(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_bessel_j_float.cu b/test/test_cyl_bessel_j_float.cu
new file mode 100644
index 000000000..3edc2a7c9
--- /dev/null
+++ b/test/test_cyl_bessel_j_float.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_bessel_j(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_bessel_j_nvrtc_double.cpp b/test/test_cyl_bessel_j_nvrtc_double.cpp
new file mode 100644
index 000000000..f74e112ed
--- /dev/null
+++ b/test/test_cyl_bessel_j_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/bessel.hpp>
+extern "C" __global__
+void test_cyl_bessel_j_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
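+
+        // Round the grid size up so the last partial block still covers the tail;
+        // the kernel's bounds check (i < numElements) discards the surplus threads.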
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_bessel_j(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_bessel_j_nvrtc_float.cpp b/test/test_cyl_bessel_j_nvrtc_float.cpp
new file mode 100644
index 000000000..e3d792843
--- /dev/null
+++ b/test/test_cyl_bessel_j_nvrtc_float.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/bessel.hpp>
+extern "C" __global__
+void test_cyl_bessel_j_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_bessel_j(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_bessel_k_double.cu b/test/test_cyl_bessel_k_double.cu
new file mode 100644
index 000000000..3dfd2bf38
--- /dev/null
+++ b/test/test_cyl_bessel_k_double.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_bessel_k(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
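+    // (the 10-epsilon tolerance absorbs FMA/rounding differences between device and host)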
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_bessel_k_float.cu b/test/test_cyl_bessel_k_float.cu
new file mode 100644
index 000000000..b874857a0
--- /dev/null
+++ b/test/test_cyl_bessel_k_float.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_bessel_k(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_bessel_k_nvrtc_double.cpp b/test/test_cyl_bessel_k_nvrtc_double.cpp
new file mode 100644
index 000000000..66a8b1490
--- /dev/null
+++ b/test/test_cyl_bessel_k_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <random>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/special_functions/bessel.hpp>
+extern "C" __global__
+void test_cyl_bessel_k_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_k_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_cyl_bessel_k_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_k_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
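+        // Compare against the host implementation, skipping cases where the host
+        // reference itself is non-finite (overflow is ignored by the policy above).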
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::cyl_bessel_k(h_in1[i], h_in2[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_bessel_k_nvrtc_float.cpp b/test/test_cyl_bessel_k_nvrtc_float.cpp
new file mode 100644
index 000000000..e23ff82c0
--- /dev/null
+++ b/test/test_cyl_bessel_k_nvrtc_float.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_cyl_bessel_k_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_k_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_bessel_k_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_k_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
+        h_in2[i] = static_cast<float_type>(dist(rng));
+    }
+
+    checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+    checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+    checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+    checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+    checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+    int blockSize = 256;
+    int numBlocks = (numElements + blockSize - 1) / blockSize;
+    void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+    checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+    checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+    // Verify Result
+    for (int i = 0; i < numElements; ++i)
+    {
+        const auto res = boost::math::cyl_bessel_k(h_in1[i], h_in2[i]);
+
+        if (std::isfinite(res))
+        {
+            if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+            }
+        }
+    }
+
+    cudaFree(d_in1);
+    cudaFree(d_in2);
+    cudaFree(d_out);
+    delete[] h_in1;
+    delete[] h_in2;
+    delete[] h_out;
+
+    nvrtcDestroyProgram(&prog);
+    delete[] ptx;
+
+    cuCtxDestroy(context);
+
+    std::cout << "Kernel executed successfully." << std::endl;
+    return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_cyl_neumann_double.cu b/test/test_cyl_neumann_double.cu
new file mode 100644
index 000000000..0e7a72ff9
--- /dev/null
+++ b/test/test_cyl_neumann_double.cu
@@ -0,0 +1,116 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_neumann(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << results[i]
+                          << "\n  Serial: " << output_vector[i]
+                          << "\n    Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_neumann_float.cu b/test/test_cyl_neumann_float.cu
new file mode 100644
index 000000000..f621d2fc6
--- /dev/null
+++ b/test/test_cyl_neumann_float.cu
@@ -0,0 +1,104 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <vector>
+#include <boost/math/special_functions/bessel.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = rand()/(float_type)RAND_MAX;
+        input_vector2[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::cyl_neumann(input_vector1[i], input_vector2[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cyl_neumann_nvrtc_double.cpp b/test/test_cyl_neumann_nvrtc_double.cpp
new file mode 100644
index 000000000..78bbd3b5c
--- /dev/null
+++ b/test/test_cyl_neumann_nvrtc_double.cpp
@@ -0,0 +1,190 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_cyl_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::cyl_neumann(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_neumann_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_neumann_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_neumann_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); 
+ h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::cyl_neumann(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_cyl_neumann_nvrtc_float.cpp b/test/test_cyl_neumann_nvrtc_float.cpp new file mode 100644 index 000000000..78bbd3b5c --- /dev/null +++ b/test/test_cyl_neumann_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_cyl_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::cyl_neumann(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_neumann_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_neumann_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_neumann_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); 
+ h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::cyl_neumann(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_sph_bessel_double.cu b/test/test_sph_bessel_double.cu new file mode 100644 index 000000000..5229dd8b5 --- /dev/null +++ b/test/test_sph_bessel_double.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const unsigned *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::sph_bessel(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + std::mt19937_64 rng {42}; + std::uniform_int_distribution order(1, 100); + std::uniform_real_distribution val(0, 100); + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = order(rng); + input_vector2[i] = val(rng); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::sph_bessel(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + bool failed = false; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(output_vector[i]) && std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 3000) + { + std::cout << "error at line: " << i + << "\nParallel: " << results[i] + << "\n Serial: " << output_vector[i] + << "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_sph_bessel_float.cu b/test/test_sph_bessel_float.cu new file mode 100644 index 000000000..bd068a1a0 --- /dev/null +++ b/test/test_sph_bessel_float.cu @@ -0,0 +1,119 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const unsigned *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::sph_bessel(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + std::mt19937_64 rng {42}; + std::uniform_int_distribution order(1, 100); + std::uniform_real_distribution val(0, 100); + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = order(rng); + input_vector2[i] = val(rng); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::sph_bessel(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + bool failed = false; + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(output_vector[i]) && std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 150) + { + std::cout << "error at line: " << i + << "\nParallel: " << results[i] + << "\n Serial: " << output_vector[i] + << "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_sph_bessel_nvrtc_double.cpp b/test/test_sph_bessel_nvrtc_double.cpp new file mode 100644 index 000000000..e88726ed7 --- /dev/null +++ b/test/test_sph_bessel_nvrtc_double.cpp @@ -0,0 +1,199 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_cyl_bessel_j_kernel(const unsigned *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::sph_bessel(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + unsigned *h_in1, *d_in1; + float_type *h_in2, *h_out; + float_type *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new unsigned[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_int_distribution order(1, 100); + std::uniform_real_distribution val(0.0f, 100.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(order(rng)); + h_in2[i] = static_cast(val(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(unsigned)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(unsigned), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + bool failed = false; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::sph_bessel(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 3000) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + failed = true; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + if (failed) + { + return 1; + } + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_sph_bessel_nvrtc_float.cpp b/test/test_sph_bessel_nvrtc_float.cpp new file mode 100644 index 000000000..c9538cd5b --- /dev/null +++ b/test/test_sph_bessel_nvrtc_float.cpp @@ -0,0 +1,199 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_cyl_bessel_j_kernel(const unsigned *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::sph_bessel(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + unsigned *h_in1, *d_in1; + float_type *h_in2, *h_out; + float_type *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new unsigned[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_int_distribution order(1, 100); + std::uniform_real_distribution val(0.0f, 100.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(order(rng)); + h_in2[i] = static_cast(val(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(unsigned)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(unsigned), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + bool failed = false; + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::sph_bessel(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 3000) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + failed = true; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + if (failed) + { + return 1; + } + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_sph_neumann_double.cu b/test/test_sph_neumann_double.cu new file mode 100644 index 000000000..f59dc7acc --- /dev/null +++ b/test/test_sph_neumann_double.cu @@ -0,0 +1,116 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::sph_neumann(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::sph_neumann(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(output_vector[i]) && std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000) + { + std::cout << "error at line: " << i + << "\nParallel: " << results[i] + << "\n Serial: " << output_vector[i] + << "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_sph_neumann_float.cu b/test/test_sph_neumann_float.cu new file mode 100644 index 000000000..a295e376f --- /dev/null +++ b/test/test_sph_neumann_float.cu @@ -0,0 +1,116 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::sph_neumann(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = rand()/(float_type)RAND_MAX; + input_vector2[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::sph_neumann(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (std::isfinite(output_vector[i]) && std::isfinite(results[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000) + { + std::cout << "error at line: " << i + << "\nParallel: " << results[i] + << "\n Serial: " << output_vector[i] + << "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_sph_neumann_nvrtc_double.cpp b/test/test_sph_neumann_nvrtc_double.cpp new file mode 100644 index 000000000..61dcb07dd --- /dev/null +++ b/test/test_sph_neumann_nvrtc_double.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_sph_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::sph_neumann(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_neumann_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_sph_neumann_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_neumann_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); 
+ h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::sph_neumann(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_sph_neumann_nvrtc_float.cpp b/test/test_sph_neumann_nvrtc_float.cpp new file mode 100644 index 000000000..5d7ae59fe --- /dev/null +++ b/test/test_sph_neumann_nvrtc_float.cpp @@ -0,0 +1,190 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_sph_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::sph_neumann(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_neumann_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_sph_neumann_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_neumann_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1000.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast<float_type>(dist(rng));
+    }
+
+    checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+    checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+    checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+    checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+    checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+    int blockSize = 256;
+    int numBlocks = (numElements + blockSize - 1) / blockSize;
+    void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+    checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+    checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+    // Verify Result
+    for (int i = 0; i < numElements; ++i)
+    {
+        const auto res = boost::math::sph_neumann(h_in1[i], h_in2[i]);
+
+        if (std::isfinite(res))
+        {
+            if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+            {
+                std::cout << "error at line: " << i
+                          << "\nParallel: " << h_out[i]
+                          << "\n  Serial: " << res
+                          << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+            }
+        }
+    }
+
+    cudaFree(d_in1);
+    cudaFree(d_in2);
+    cudaFree(d_out);
+    delete[] h_in1;
+    delete[] h_in2;
+    delete[] h_out;
+
+    nvrtcDestroyProgram(&prog);
+    delete[] ptx;
+
+    cuCtxDestroy(context);
+
+    std::cout << "Kernel executed successfully." << std::endl;
+    return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
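
---

The test_*_nvrtc_*.cpp drivers added above all share one compile/load/launch skeleton. For reference, here is a minimal, self-contained sketch of that NVRTC pattern. It is illustrative only: the toy square kernel, the include choices, and the omitted error handling are assumptions made for brevity, not code from this patch.

// Minimal NVRTC compile/load/launch sketch (build with -lnvrtc -lcuda).
// Error checking is elided; the real tests wrap every call in a checker.
#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <vector>

static const char* src = R"(
extern "C" __global__ void square(const float* in, float* out, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) { out[i] = in[i] * in[i]; }
}
)";

int main()
{
    // 1) Compile CUDA C++ source to PTX at run time
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "square.cu", 0, nullptr, nullptr);
    const char* opts[] = {"--std=c++14"};
    if (nvrtcCompileProgram(prog, 1, opts) != NVRTC_SUCCESS) { return 1; }
    size_t ptx_size;
    nvrtcGetPTXSize(prog, &ptx_size);
    std::vector<char> ptx(ptx_size);
    nvrtcGetPTX(prog, ptx.data());
    nvrtcDestroyProgram(&prog);

    // 2) Load the PTX with the driver API and look up the kernel by name
    cuInit(0);
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    CUcontext ctx;
    cuCtxCreate(&ctx, 0, dev);
    CUmodule mod;
    cuModuleLoadDataEx(&mod, ptx.data(), 0, nullptr, nullptr);
    CUfunction fn;
    cuModuleGetFunction(&fn, mod, "square");

    // 3) Launch with the usual ceiling-division grid size
    int n = 5000;
    std::vector<float> h_in(n, 2.0f), h_out(n);
    CUdeviceptr d_in, d_out;
    cuMemAlloc(&d_in, n * sizeof(float));
    cuMemAlloc(&d_out, n * sizeof(float));
    cuMemcpyHtoD(d_in, h_in.data(), n * sizeof(float));
    int block = 256, grid = (n + block - 1) / block;
    void* args[] = { &d_in, &d_out, &n };
    cuLaunchKernel(fn, grid, 1, 1, block, 1, 1, 0, nullptr, args, nullptr);
    cuCtxSynchronize();
    cuMemcpyDtoH(h_out.data(), d_out, n * sizeof(float));

    cuMemFree(d_in);
    cuMemFree(d_out);
    cuCtxDestroy(ctx);
    std::printf("h_out[0] = %f\n", h_out[0]);
    return 0;
}

Each driver above instantiates this skeleton with the Boost.Math function under test inside the kernel string, pseudo-random inputs, and a serial boost::math reference run compared against the device results via epsilon_difference.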