From d01893d215f9069849f34c667c467b1112882e64 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 10:48:07 -0400
Subject: [PATCH] Add GPU markers to fisher f dist

Add SYCL testing of fisher f dist
Add CUDA fisher f dist testing
Add NVRTC fisher f dist testing
Add GPU support to gamma dist
Add SYCL testing of gamma dist
Add CUDA gamma dist testing
Add NVRTC gamma dist testing
Reduce number of threads per block since it can crash CI
Add GPU support to the geometric dist
Add SYCL testing of geometric dist
Add cuda::std::tie
Add GPU support to inv_discrete_quantile
Add CUDA testing of geometric dist
Add NVRTC testing of geometric dist
Add SYCL testing of inverse_chi_squared dist
Adjust tol
Add NVRTC inverse chi squared dist testing
Add CUDA inverse chi squared dist testing
Add GPU support to inverse gamma dist
Add SYCL testing to inverse gamma dist
Add NVRTC testing of inverse gamma dist
Add CUDA testing of inverse gamma dist
---
 .../detail/inv_discrete_quantile.hpp | 71 ++++---
 include/boost/math/distributions/fisher_f.hpp | 69 +++----
 include/boost/math/distributions/gamma.hpp | 84 ++++----
 .../boost/math/distributions/geometric.hpp | 93 +++++----
 .../distributions/inverse_chi_squared.hpp | 68 +++----
 .../math/distributions/inverse_gamma.hpp | 77 +++----
 include/boost/math/tools/tuple.hpp | 1 +
 test/cuda_jamfile | 35 ++++
 test/nvrtc_jamfile | 35 ++++
 test/sycl_jamfile | 5 +
 test/test_arcsine_cdf_double.cu | 2 +-
 test/test_arcsine_cdf_float.cu | 2 +-
 test/test_arcsine_pdf_double.cu | 2 +-
 test/test_arcsine_pdf_float.cu | 2 +-
 test/test_arcsine_quan_double.cu | 2 +-
 test/test_arcsine_quan_float.cu | 2 +-
 test/test_arcsine_range_support_double.cu | 2 +-
 test/test_arcsine_range_support_float.cu | 2 +-
 test/test_bernoulli_cdf_double.cu | 2 +-
 test/test_bernoulli_cdf_float.cu | 2 +-
 test/test_bernoulli_pdf_double.cu | 2 +-
 test/test_bernoulli_pdf_float.cu | 2 +-
 test/test_bernoulli_range_support_double.cu | 2 +-
 test/test_bernoulli_range_support_float.cu | 2 +-
 test/test_beta_dist_cdf_double.cu | 2 +-
 test/test_beta_dist_cdf_float.cu | 2 +-
 test/test_beta_dist_pdf_double.cu | 2 +-
 test/test_beta_dist_pdf_float.cu | 2 +-
 test/test_beta_dist_quan_double.cu | 2 +-
 test/test_beta_dist_quan_float.cu | 2 +-
 test/test_cauchy_cdf_double.cu | 2 +-
 test/test_cauchy_cdf_float.cu | 2 +-
 test/test_cauchy_pdf_double.cu | 2 +-
 test/test_cauchy_pdf_float.cu | 2 +-
 test/test_cauchy_quan_double.cu | 2 +-
 test/test_cauchy_quan_float.cu | 2 +-
 test/test_cauchy_range_support_double.cu | 2 +-
 test/test_cauchy_range_support_float.cu | 2 +-
 test/test_chi_squared_cdf_double.cu | 2 +-
 test/test_chi_squared_cdf_float.cu | 2 +-
 test/test_chi_squared_pdf_double.cu | 2 +-
 test/test_chi_squared_pdf_float.cu | 2 +-
 test/test_chi_squared_quan_double.cu | 2 +-
 test/test_chi_squared_quan_float.cu | 2 +-
 test/test_exponential_cdf_double.cu | 2 +-
 test/test_exponential_cdf_float.cu | 2 +-
 test/test_exponential_pdf_double.cu | 2 +-
 test/test_exponential_pdf_float.cu | 2 +-
 test/test_exponential_quan_double.cu | 2 +-
 test/test_exponential_quan_float.cu | 2 +-
 test/test_exponential_range_support_double.cu | 2 +-
 test/test_exponential_range_support_float.cu | 2 +-
 test/test_extreme_value_cdf_double.cu | 2 +-
 test/test_extreme_value_cdf_float.cu | 2 +-
 test/test_extreme_value_pdf_double.cu | 2 +-
 test/test_extreme_value_pdf_float.cu | 2 +-
 test/test_extreme_value_quan_double.cu | 2 +-
 test/test_extreme_value_quan_float.cu | 2 +-
 test/test_fisher_f.cpp | 6 +-
 test/test_fisher_f_cdf_double.cu | 109 ++++++++++
 test/test_fisher_f_cdf_float.cu | 109 ++++++++++
 test/test_fisher_f_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_fisher_f_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_fisher_f_pdf_double.cu | 109 ++++++++++
 test/test_fisher_f_pdf_float.cu | 109 ++++++++++
 test/test_fisher_f_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_fisher_f_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_fisher_f_quan_double.cu | 109 ++++++++++
 test/test_fisher_f_quan_float.cu | 109 ++++++++++
 test/test_fisher_f_quan_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_fisher_f_quan_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist.cpp | 9 +-
 test/test_gamma_dist_cdf_double.cu | 109 ++++++++++
 test/test_gamma_dist_cdf_float.cu | 109 ++++++++++
 test/test_gamma_dist_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist_pdf_double.cu | 109 ++++++++++
 test/test_gamma_dist_pdf_float.cu | 109 ++++++++++
 test/test_gamma_dist_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist_quan_double.cu | 109 ++++++++++
 test/test_gamma_dist_quan_float.cu | 109 ++++++++++
 test/test_gamma_dist_quan_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist_quan_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_geometric.cpp | 18 +-
 test/test_geometric_dist_cdf_double.cu | 109 ++++++++++
 test/test_geometric_dist_cdf_float.cu | 109 ++++++++++
 test/test_geometric_dist_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_geometric_dist_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_geometric_dist_pdf_double.cu | 109 ++++++++++
 test/test_geometric_dist_pdf_float.cu | 109 ++++++++++
 test/test_geometric_dist_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_geometric_dist_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_geometric_dist_quan_double.cu | 109 ++++++++++
 test/test_geometric_dist_quan_float.cu | 109 ++++++++++
 .../test_geometric_dist_quan_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_geometric_dist_quan_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_holtsmark_cdf_double.cu | 2 +-
 test/test_holtsmark_cdf_float.cu | 2 +-
 test/test_holtsmark_pdf_double.cu | 2 +-
 test/test_holtsmark_pdf_float.cu | 2 +-
 test/test_inverse_chi_squared_cdf_double.cu | 110 ++++++++++
 test/test_inverse_chi_squared_cdf_float.cu | 110 ++++++++++
 ...t_inverse_chi_squared_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 ...st_inverse_chi_squared_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 .../test_inverse_chi_squared_distribution.cpp | 7 +-
 test/test_inverse_chi_squared_pdf_double.cu | 110 ++++++++++
 test/test_inverse_chi_squared_pdf_float.cu | 110 ++++++++++
 ...t_inverse_chi_squared_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 ...st_inverse_chi_squared_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_inverse_chi_squared_quan_double.cu | 110 ++++++++++
 test/test_inverse_chi_squared_quan_float.cu | 110 ++++++++++
 ..._inverse_chi_squared_quan_nvrtc_double.cpp | 191 ++++++++++++++++++
 ...t_inverse_chi_squared_quan_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_cdf_double.cu | 110 ++++++++++
 test/test_inverse_gamma_cdf_float.cu | 110 ++++++++++
 test/test_inverse_gamma_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_distribution.cpp | 7 +-
 test/test_inverse_gamma_pdf_double.cu | 110 ++++++++++
 test/test_inverse_gamma_pdf_float.cu | 110 ++++++++++
 test/test_inverse_gamma_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_quan_double.cu | 110 ++++++++++
 test/test_inverse_gamma_quan_float.cu | 110 ++++++++++
 test/test_inverse_gamma_quan_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_quan_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_landau_cdf_double.cu | 2 +-
 test/test_landau_cdf_float.cu | 2 +-
 test/test_landau_pdf_double.cu | 2 +-
 test/test_landau_pdf_float.cu | 2 +-
 test/test_landau_quan_double.cu | 2 +-
 test/test_landau_quan_float.cu | 2 +-
 test/test_laplace_cdf_double.cu | 2 +-
 test/test_laplace_cdf_float.cu | 2 +-
 test/test_laplace_pdf_double.cu | 2 +-
 test/test_laplace_pdf_float.cu | 2 +-
 test/test_laplace_quan_double.cu | 2 +-
 test/test_laplace_quan_float.cu | 2 +-
 test/test_logistic_cdf_double.cu | 2 +-
 test/test_logistic_cdf_float.cu | 2 +-
 test/test_logistic_pdf_double.cu | 2 +-
 test/test_logistic_pdf_float.cu | 2 +-
 test/test_logistic_quan_double.cu | 2 +-
 test/test_logistic_quan_float.cu | 2 +-
 test/test_mapairy_cdf_double.cu | 2 +-
 test/test_mapairy_cdf_float.cu | 2 +-
 test/test_mapairy_pdf_double.cu | 2 +-
 test/test_mapairy_pdf_float.cu | 2 +-
 test/test_mapairy_quan_double.cu | 2 +-
 test/test_mapairy_quan_float.cu | 2 +-
 test/test_saspoint5_cdf_double.cu | 2 +-
 test/test_saspoint5_cdf_float.cu | 2 +-
 test/test_saspoint5_pdf_double.cu | 2 +-
 test/test_saspoint5_pdf_float.cu | 2 +-
 test/test_saspoint5_quan_double.cu | 2 +-
 test/test_saspoint5_quan_float.cu | 2 +-
 test/test_weibull_cdf_double.cu | 2 +-
 test/test_weibull_cdf_float.cu | 2 +-
 test/test_weibull_pdf_double.cu | 2 +-
 test/test_weibull_pdf_float.cu | 2 +-
 test/test_weibull_quan_double.cu | 2 +-
 test/test_weibull_quan_float.cu | 2 +-
 163 files changed, 9449 insertions(+), 324 deletions(-)
 create mode 100644 test/test_fisher_f_cdf_double.cu
 create mode 100644 test/test_fisher_f_cdf_float.cu
 create mode 100644 test/test_fisher_f_cdf_nvrtc_double.cpp
 create mode 100644 test/test_fisher_f_cdf_nvrtc_float.cpp
 create mode 100644 test/test_fisher_f_pdf_double.cu
 create mode 100644 test/test_fisher_f_pdf_float.cu
 create mode 100644 test/test_fisher_f_pdf_nvrtc_double.cpp
 create mode 100644 test/test_fisher_f_pdf_nvrtc_float.cpp
 create mode 100644 test/test_fisher_f_quan_double.cu
 create mode 100644 test/test_fisher_f_quan_float.cu
 create mode 100644 test/test_fisher_f_quan_nvrtc_double.cpp
 create mode 100644 test/test_fisher_f_quan_nvrtc_float.cpp
 create mode 100644 test/test_gamma_dist_cdf_double.cu
 create mode 100644 test/test_gamma_dist_cdf_float.cu
 create mode 100644 test/test_gamma_dist_cdf_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_cdf_nvrtc_float.cpp
 create mode 100644 test/test_gamma_dist_pdf_double.cu
 create mode 100644 test/test_gamma_dist_pdf_float.cu
 create mode 100644 test/test_gamma_dist_pdf_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_pdf_nvrtc_float.cpp
 create mode 100644 test/test_gamma_dist_quan_double.cu
 create mode 100644 test/test_gamma_dist_quan_float.cu
 create mode 100644 test/test_gamma_dist_quan_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_quan_nvrtc_float.cpp
 create mode 100644 test/test_geometric_dist_cdf_double.cu
 create mode 100644 test/test_geometric_dist_cdf_float.cu
 create mode 100644 test/test_geometric_dist_cdf_nvrtc_double.cpp
 create mode 100644 test/test_geometric_dist_cdf_nvrtc_float.cpp
 create mode 100644 test/test_geometric_dist_pdf_double.cu
 create mode 100644 test/test_geometric_dist_pdf_float.cu
 create mode 100644 test/test_geometric_dist_pdf_nvrtc_double.cpp
 create mode 100644 test/test_geometric_dist_pdf_nvrtc_float.cpp
 create mode 100644 test/test_geometric_dist_quan_double.cu
 create mode 100644 test/test_geometric_dist_quan_float.cu
 create mode 100644 test/test_geometric_dist_quan_nvrtc_double.cpp
 create mode 100644 test/test_geometric_dist_quan_nvrtc_float.cpp
 create mode 100644 test/test_inverse_chi_squared_cdf_double.cu
 create mode 100644 test/test_inverse_chi_squared_cdf_float.cu
 create mode 100644 test/test_inverse_chi_squared_cdf_nvrtc_double.cpp
 create mode 100644 test/test_inverse_chi_squared_cdf_nvrtc_float.cpp
 create mode 100644 test/test_inverse_chi_squared_pdf_double.cu
 create mode 100644 test/test_inverse_chi_squared_pdf_float.cu
 create mode 100644 test/test_inverse_chi_squared_pdf_nvrtc_double.cpp
 create mode 100644 test/test_inverse_chi_squared_pdf_nvrtc_float.cpp
 create mode 100644 test/test_inverse_chi_squared_quan_double.cu
 create mode 100644 test/test_inverse_chi_squared_quan_float.cu
 create mode 100644 test/test_inverse_chi_squared_quan_nvrtc_double.cpp
 create mode 100644 test/test_inverse_chi_squared_quan_nvrtc_float.cpp
 create mode 100644 test/test_inverse_gamma_cdf_double.cu
 create mode 100644 test/test_inverse_gamma_cdf_float.cu
 create mode 100644 test/test_inverse_gamma_cdf_nvrtc_double.cpp
 create mode 100644 test/test_inverse_gamma_cdf_nvrtc_float.cpp
 create mode 100644 test/test_inverse_gamma_pdf_double.cu
 create mode 100644 test/test_inverse_gamma_pdf_float.cu
 create mode 100644 test/test_inverse_gamma_pdf_nvrtc_double.cpp
 create mode 100644 test/test_inverse_gamma_pdf_nvrtc_float.cpp
 create mode 100644 test/test_inverse_gamma_quan_double.cu
 create mode 100644 test/test_inverse_gamma_quan_float.cu
 create mode 100644 test/test_inverse_gamma_quan_nvrtc_double.cpp
 create mode 100644 test/test_inverse_gamma_quan_nvrtc_float.cpp

diff --git a/include/boost/math/distributions/detail/inv_discrete_quantile.hpp b/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
index 739a86666..f688345b7 100644
--- a/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
+++ b/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
@@ -6,7 +6,11 @@
 #ifndef BOOST_MATH_DISTRIBUTIONS_DETAIL_INV_DISCRETE_QUANTILE
 #define BOOST_MATH_DISTRIBUTIONS_DETAIL_INV_DISCRETE_QUANTILE
 
-#include
+#include
+#include
+#include
+#include
+#include
 
 namespace boost{ namespace math{ namespace detail{
 
@@ -19,10 +23,10 @@ struct distribution_quantile_finder
    typedef typename Dist::value_type value_type;
    typedef typename Dist::policy_type policy_type;
 
-   distribution_quantile_finder(const Dist d, value_type p, bool c)
+   BOOST_MATH_GPU_ENABLED distribution_quantile_finder(const Dist d, value_type p, bool c)
       : dist(d), target(p), comp(c) {}
 
-   value_type operator()(value_type const& x)
+   BOOST_MATH_GPU_ENABLED value_type operator()(value_type const& x)
    {
       return comp ? value_type(target - cdf(complement(dist, x))) : value_type(cdf(dist, x) - target);
    }
@@ -42,24 +46,24 @@ private:
 // in the root no longer being bracketed.
 //
 template <class Real, class Tol>
-void adjust_bounds(Real& /* a */, Real& /* b */, Tol const& /* tol */){}
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& /* a */, Real& /* b */, Tol const& /* tol */){}
 
 template <class Real>
-void adjust_bounds(Real& /* a */, Real& b, tools::equal_floor const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& /* a */, Real& b, tools::equal_floor const& /* tol */)
 {
    BOOST_MATH_STD_USING
    b -= tools::epsilon<Real>() * b;
 }
 
 template <class Real>
-void adjust_bounds(Real& a, Real& /* b */, tools::equal_ceil const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& a, Real& /* b */, tools::equal_ceil const& /* tol */)
 {
    BOOST_MATH_STD_USING
    a += tools::epsilon<Real>() * a;
 }
 
 template <class Real>
-void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol */)
 {
    BOOST_MATH_STD_USING
    a += tools::epsilon<Real>() * a;
@@ -69,7 +73,7 @@ void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol
 // This is where all the work is done:
 //
 template <class Dist, class Tolerance>
-typename Dist::value_type
+BOOST_MATH_GPU_ENABLED typename Dist::value_type
    do_inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -78,7 +82,7 @@ typename Dist::value_type
       const typename Dist::value_type& multiplier,
       typename Dist::value_type adder,
       const Tolerance& tol,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    typedef typename Dist::policy_type policy_type;
@@ -100,7 +104,7 @@ typename Dist::value_type
       guess = min_bound;
 
    value_type fa = f(guess);
-   std::uintmax_t count = max_iter - 1;
+   boost::math::uintmax_t count = max_iter - 1;
    value_type fb(fa), a(guess), b =0; // Compiler warning C4701: potentially uninitialized local variable 'b' used
 
    if(fa == 0)
@@ -130,7 +134,7 @@ typename Dist::value_type
       else
       {
          b = a;
-         a = (std::max)(value_type(b - 1), value_type(0));
+         a = BOOST_MATH_GPU_SAFE_MAX(value_type(b - 1), value_type(0));
          if(a < min_bound)
            a = min_bound;
         fa = f(a);
@@ -153,7 +157,7 @@ typename Dist::value_type
      // If we're looking for a large result, then bump "adder" up
      // by a bit to increase our chances of bracketing the root:
      //
-     //adder = (std::max)(adder, 0.001f * guess);
+     //adder = BOOST_MATH_GPU_SAFE_MAX(adder, 0.001f * guess);
      if(fa < 0)
      {
         b = a + adder;
@@ -162,7 +166,7 @@ typename Dist::value_type
      }
      else
      {
-        b = (std::max)(value_type(a - adder), value_type(0));
+        b = BOOST_MATH_GPU_SAFE_MAX(value_type(a - adder), value_type(0));
         if(b < min_bound)
            b = min_bound;
      }
@@ -186,7 +190,7 @@ typename Dist::value_type
         }
         else
         {
-           b = (std::max)(value_type(a - adder), value_type(0));
+           b = BOOST_MATH_GPU_SAFE_MAX(value_type(a - adder), value_type(0));
            if(b < min_bound)
               b = min_bound;
         }
@@ -195,9 +199,8 @@ typename Dist::value_type
      }
      if(a > b)
      {
-        using std::swap;
-        swap(a, b);
-        swap(fa, fb);
+        BOOST_MATH_GPU_SAFE_SWAP(a, b);
+        BOOST_MATH_GPU_SAFE_SWAP(fa, fb);
      }
   }
   //
@@ -274,7 +277,7 @@ typename Dist::value_type
   //
   // Go ahead and find the root:
   //
-  std::pair<value_type, value_type> r = toms748_solve(f, a, b, fa, fb, tol, count, policy_type());
+  boost::math::pair<value_type, value_type> r = toms748_solve(f, a, b, fa, fb, tol, count, policy_type());
   max_iter += count;
   if (max_iter >= policies::get_max_root_iterations<policy_type>())
   {
@@ -293,7 +296,7 @@ typename Dist::value_type
 // is very close 1.
 //
 template <class Dist>
-inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type cc = ceil(result);
@@ -325,7 +328,7 @@ inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::va
 #endif
 
 template <class Dist>
-inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type cc = floor(result);
@@ -339,7 +342,11 @@ inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::val
    //
    while(true)
    {
+      #ifdef BOOST_MATH_HAS_GPU_SUPPORT
+      cc = ceil(nextafter(result, tools::max_value()));
+      #else
       cc = ceil(float_next(result));
+      #endif
       if(cc > support(d).second)
          break;
       pp = c ? cdf(complement(d, cc)) : cdf(d, cc);
@@ -362,7 +369,7 @@ inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::val
 // to an int where required.
 //
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       typename Dist::value_type p,
@@ -371,7 +378,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    if(p > 0.5)
    {
@@ -393,7 +400,7 @@ inline typename Dist::value_type
 }
 
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -402,7 +409,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -436,7 +443,7 @@ inline typename Dist::value_type
 }
 
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -445,7 +452,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -479,7 +486,7 @@ inline typename Dist::value_type
 }
 
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -488,7 +495,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -507,7 +514,7 @@ inline typename Dist::value_type
 }
 
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -516,7 +523,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type pp = c ? 1 - p : p;
@@ -534,7 +541,7 @@ inline typename Dist::value_type
 }
 
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -543,7 +550,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
diff --git a/include/boost/math/distributions/fisher_f.hpp b/include/boost/math/distributions/fisher_f.hpp
index e22cdf50a..56b288d88 100644
--- a/include/boost/math/distributions/fisher_f.hpp
+++ b/include/boost/math/distributions/fisher_f.hpp
@@ -1,5 +1,5 @@
 // Copyright John Maddock 2006.
-
+// Copyright Matt Borland 2024.
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0.
 // (See accompanying file LICENSE_1_0.txt
@@ -8,14 +8,15 @@
 #ifndef BOOST_MATH_DISTRIBUTIONS_FISHER_F_HPP
 #define BOOST_MATH_DISTRIBUTIONS_FISHER_F_HPP
 
+#include
+#include
+#include
 #include
 #include // for incomplete beta.
 #include // complements
 #include // error checks
 #include
 
-#include
-
 namespace boost{ namespace math{
 
 template <class RealType = double, class Policy = policies::policy<> >
@@ -25,9 +26,9 @@ public:
    typedef RealType value_type;
    typedef Policy policy_type;
 
-   fisher_f_distribution(const RealType& i, const RealType& j) : m_df1(i), m_df2(j)
+   BOOST_MATH_GPU_ENABLED fisher_f_distribution(const RealType& i, const RealType& j) : m_df1(i), m_df2(j)
   {
-      static const char* function = "fisher_f_distribution<%1%>::fisher_f_distribution";
+      constexpr auto function = "fisher_f_distribution<%1%>::fisher_f_distribution";
      RealType result;
      detail::check_df(
         function, m_df1, &result, Policy());
@@ -35,11 +36,11 @@ public:
         function, m_df2, &result, Policy());
   } // fisher_f_distribution
 
-   RealType degrees_of_freedom1()const
+   BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom1()const
   {
      return m_df1;
   }
-   RealType degrees_of_freedom2()const
+   BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom2()const
   {
      return m_df2;
   }
@@ -60,29 +61,29 @@ fisher_f_distribution(RealType,RealType)->fisher_f_distribution
 
 template <class RealType, class Policy>
-inline const std::pair<RealType, RealType> range(const fisher_f_distribution<RealType, Policy>& /*dist*/)
+BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> range(const fisher_f_distribution<RealType, Policy>& /*dist*/)
 { // Range of permissible values for random variable x.
   using boost::math::tools::max_value;
-  return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
+  return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-inline const std::pair<RealType, RealType> support(const fisher_f_distribution<RealType, Policy>& /*dist*/)
+BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> support(const fisher_f_distribution<RealType, Policy>& /*dist*/)
 { // Range of supported values for random variable x.
   // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
   using boost::math::tools::max_value;
-  return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
+  return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-RealType pdf(const fisher_f_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED RealType pdf(const fisher_f_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING  // for ADL of std functions
    RealType df1 = dist.degrees_of_freedom1();
    RealType df2 = dist.degrees_of_freedom2();
    // Error check:
    RealType error_result = 0;
-   static const char* function = "boost::math::pdf(fisher_f_distribution<%1%> const&, %1%)";
+   constexpr auto function = "boost::math::pdf(fisher_f_distribution<%1%> const&, %1%)";
    if(false == (detail::check_df(
          function, df1, &error_result, Policy())
       && detail::check_df(
@@ -132,9 +133,9 @@ RealType pdf(const fisher_f_distribution& dist, const RealType
 } // pdf
 
 template <class RealType, class Policy>
-inline RealType cdf(const fisher_f_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const fisher_f_distribution<RealType, Policy>& dist, const RealType& x)
 {
-   static const char* function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)";
+   constexpr auto function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)";
    RealType df1 = dist.degrees_of_freedom1();
    RealType df2 = dist.degrees_of_freedom2();
    // Error check:
@@ -167,9 +168,9 @@ inline RealType cdf(const fisher_f_distribution& dist, const R
 } // cdf
 
 template <class RealType, class Policy>
-inline RealType quantile(const fisher_f_distribution<RealType, Policy>& dist, const RealType& p)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const fisher_f_distribution<RealType, Policy>& dist, const RealType& p)
 {
-   static const char* function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)";
+   constexpr auto function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)";
    RealType df1 = dist.degrees_of_freedom1();
    RealType df2 = dist.degrees_of_freedom2();
    // Error check:
@@ -192,9 +193,9 @@ inline RealType quantile(const fisher_f_distribution& dist, co
 } // quantile
 
 template <class RealType, class Policy>
-inline RealType cdf(const complemented2_type<fisher_f_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<fisher_f_distribution<RealType, Policy>, RealType>& c)
 {
-   static const char* function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)";
+   constexpr auto function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)";
    RealType df1 = c.dist.degrees_of_freedom1();
    RealType df2 = c.dist.degrees_of_freedom2();
    RealType x = c.param;
@@ -228,9 +229,9 @@ inline RealType cdf(const complemented2_type
 
 template <class RealType, class Policy>
-inline RealType quantile(const complemented2_type<fisher_f_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<fisher_f_distribution<RealType, Policy>, RealType>& c)
 {
-   static const char* function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)";
+   constexpr auto function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)";
    RealType df1 = c.dist.degrees_of_freedom1();
    RealType df2 = c.dist.degrees_of_freedom2();
    RealType p = c.param;
@@ -252,9 +253,9 @@ inline RealType quantile(const complemented2_type
 
 template <class RealType, class Policy>
-inline RealType mean(const fisher_f_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mean(const fisher_f_distribution<RealType, Policy>& dist)
 { // Mean of F distribution = v.
- static const char* function = "boost::math::mean(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mean(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -273,9 +274,9 @@ inline RealType mean(const fisher_f_distribution& dist) } // mean template -inline RealType variance(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const fisher_f_distribution& dist) { // Variance of F distribution. - static const char* function = "boost::math::variance(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::variance(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -294,9 +295,9 @@ inline RealType variance(const fisher_f_distribution& dist) } // variance template -inline RealType mode(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const fisher_f_distribution& dist) { - static const char* function = "boost::math::mode(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mode(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -317,15 +318,15 @@ inline RealType mode(const fisher_f_distribution& dist) //template //inline RealType median(const fisher_f_distribution& dist) //{ // Median of Fisher F distribution is not defined. -// return tools::domain_error(BOOST_CURRENT_FUNCTION, "Median is not implemented, result is %1%!", std::numeric_limits::quiet_NaN()); +// return tools::domain_error(BOOST_CURRENT_FUNCTION, "Median is not implemented, result is %1%!", boost::math::numeric_limits::quiet_NaN()); // } // median // Now implemented via quantile(half) in derived accessors. 
 template <class RealType, class Policy>
-inline RealType skewness(const fisher_f_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType skewness(const fisher_f_distribution<RealType, Policy>& dist)
 {
-   static const char* function = "boost::math::skewness(fisher_f_distribution<%1%> const&)";
+   constexpr auto function = "boost::math::skewness(fisher_f_distribution<%1%> const&)";
    BOOST_MATH_STD_USING // ADL of std names
    // See http://mathworld.wolfram.com/F-Distribution.html
    RealType df1 = dist.degrees_of_freedom1();
@@ -346,18 +347,18 @@ inline RealType skewness(const fisher_f_distribution& dist)
 }
 
 template <class RealType, class Policy>
-RealType kurtosis_excess(const fisher_f_distribution<RealType, Policy>& dist);
+BOOST_MATH_GPU_ENABLED RealType kurtosis_excess(const fisher_f_distribution<RealType, Policy>& dist);
 
 template <class RealType, class Policy>
-inline RealType kurtosis(const fisher_f_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const fisher_f_distribution<RealType, Policy>& dist)
 {
    return 3 + kurtosis_excess(dist);
 }
 
 template <class RealType, class Policy>
-inline RealType kurtosis_excess(const fisher_f_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const fisher_f_distribution<RealType, Policy>& dist)
 {
-   static const char* function = "boost::math::kurtosis_excess(fisher_f_distribution<%1%> const&)";
+   constexpr auto function = "boost::math::kurtosis_excess(fisher_f_distribution<%1%> const&)";
    // See http://mathworld.wolfram.com/F-Distribution.html
    RealType df1 = dist.degrees_of_freedom1();
    RealType df2 = dist.degrees_of_freedom2();
diff --git a/include/boost/math/distributions/gamma.hpp b/include/boost/math/distributions/gamma.hpp
index 28b7c55b0..5176f906d 100644
--- a/include/boost/math/distributions/gamma.hpp
+++ b/include/boost/math/distributions/gamma.hpp
@@ -1,4 +1,5 @@
 // Copyright John Maddock 2006.
+// Copyright Matt Borland 2024.
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,22 +11,22 @@
 // http://mathworld.wolfram.com/GammaDistribution.html
 // http://en.wikipedia.org/wiki/Gamma_distribution
 
+#include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
 
-#include
-#include
-
 namespace boost{ namespace math
 {
 namespace detail
 {
 
 template <class RealType, class Policy>
-inline bool check_gamma_shape(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma_shape(
       const char* function,
       RealType shape,
       RealType* result, const Policy& pol)
@@ -41,7 +42,7 @@ inline bool check_gamma_shape(
 }
 
 template <class RealType, class Policy>
-inline bool check_gamma_x(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma_x(
       const char* function,
       RealType const& x,
       RealType* result, const Policy& pol)
@@ -57,7 +58,7 @@ inline bool check_gamma_x(
 }
 
 template <class RealType, class Policy>
-inline bool check_gamma(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma(
       const char* function,
       RealType scale,
       RealType shape,
@@ -75,19 +76,19 @@ public:
    using value_type = RealType;
    using policy_type = Policy;
 
-   explicit gamma_distribution(RealType l_shape, RealType l_scale = 1)
+   BOOST_MATH_GPU_ENABLED explicit gamma_distribution(RealType l_shape, RealType l_scale = 1)
       : m_shape(l_shape), m_scale(l_scale)
   {
      RealType result;
      detail::check_gamma("boost::math::gamma_distribution<%1%>::gamma_distribution", l_scale, l_shape, &result, Policy());
   }
 
-   RealType shape()const
+   BOOST_MATH_GPU_ENABLED RealType shape()const
   {
      return m_shape;
   }
 
-   RealType scale()const
+   BOOST_MATH_GPU_ENABLED RealType scale()const
   {
      return m_scale;
   }
@@ -109,27 +110,27 @@ gamma_distribution(RealType,RealType)->gamma_distribution
 
 template <class RealType, class Policy>
-inline std::pair<RealType, RealType> range(const gamma_distribution<RealType, Policy>& /* dist */)
+BOOST_MATH_GPU_ENABLED inline boost::math::pair<RealType, RealType> range(const gamma_distribution<RealType, Policy>& /* dist */)
 { // Range of permissible values for random variable x.
   using boost::math::tools::max_value;
-  return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
+  return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-inline std::pair<RealType, RealType> support(const gamma_distribution<RealType, Policy>& /* dist */)
+BOOST_MATH_GPU_ENABLED inline boost::math::pair<RealType, RealType> support(const gamma_distribution<RealType, Policy>& /* dist */)
 { // Range of supported values for random variable x.
   // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
   using boost::math::tools::max_value;
   using boost::math::tools::min_value;
-  return std::pair<RealType, RealType>(min_value<RealType>(), max_value<RealType>());
+  return boost::math::pair<RealType, RealType>(min_value<RealType>(), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-inline RealType pdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType pdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::pdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::pdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -149,17 +150,17 @@ inline RealType pdf(const gamma_distribution& dist, const Real
 } // pdf
 
 template <class RealType, class Policy>
-inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
    using boost::math::lgamma;
 
-   static const char* function = "boost::math::logpdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::logpdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType k = dist.shape();
    RealType theta = dist.scale();
 
-   RealType result = -std::numeric_limits<RealType>::infinity();
+   RealType result = -boost::math::numeric_limits<RealType>::infinity();
    if(false == detail::check_gamma(function, theta, k, &result, Policy()))
       return result;
    if(false == detail::check_gamma_x(function, x, &result, Policy()))
@@ -167,7 +168,7 @@ inline RealType logpdf(const gamma_distribution& dist, const R
 
    if(x == 0)
    {
-      return std::numeric_limits<RealType>::quiet_NaN();
+      return boost::math::numeric_limits<RealType>::quiet_NaN();
    }
 
    result = -k*log(theta) + (k-1)*log(x) - lgamma(k) - (x/theta);
@@ -176,11 +177,11 @@ inline RealType logpdf(const gamma_distribution& dist, const R
 } // logpdf
 
 template <class RealType, class Policy>
-inline RealType cdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::cdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -196,11 +197,11 @@ inline RealType cdf(const gamma_distribution& dist, const Real
 } // cdf
 
 template <class RealType, class Policy>
-inline RealType quantile(const gamma_distribution<RealType, Policy>& dist, const RealType& p)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const gamma_distribution<RealType, Policy>& dist, const RealType& p)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -220,11 +221,11 @@ inline RealType quantile(const gamma_distribution& dist, const
 }
 
 template <class RealType, class Policy>
-inline RealType cdf(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = c.dist.shape();
    RealType scale = c.dist.scale();
@@ -241,11 +242,11 @@ inline RealType cdf(const complemented2_type
 }
 
 template <class RealType, class Policy>
-inline RealType quantile(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = c.dist.shape();
    RealType scale = c.dist.scale();
@@ -266,11 +267,11 @@ inline RealType quantile(const complemented2_type
 }
 
 template <class RealType, class Policy>
-inline RealType mean(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mean(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::mean(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mean(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -284,11 +285,11 @@ inline RealType mean(const gamma_distribution& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType variance(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType variance(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::variance(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::variance(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -302,11 +303,11 @@ inline RealType variance(const gamma_distribution& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType mode(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mode(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::mode(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mode(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -331,11 +332,11 @@ inline RealType mode(const gamma_distribution& dist)
 //}
 
 template <class RealType, class Policy>
-inline RealType skewness(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType skewness(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::skewness(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::skewness(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -349,11 +350,11 @@ inline RealType skewness(const gamma_distribution& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType kurtosis_excess(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::kurtosis_excess(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::kurtosis_excess(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -367,18 +368,19 @@ inline RealType kurtosis_excess(const gamma_distribution& dist
 }
 
 template <class RealType, class Policy>
-inline RealType kurtosis(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const gamma_distribution<RealType, Policy>& dist)
 {
    return kurtosis_excess(dist) + 3;
 }
 
 template <class RealType, class Policy>
-inline RealType entropy(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType entropy(const gamma_distribution<RealType, Policy>& dist)
 {
+   BOOST_MATH_STD_USING
+
    RealType k = dist.shape();
    RealType theta = dist.scale();
-   using std::log;
-   return k + log(theta) + lgamma(k) + (1-k)*digamma(k);
+   return k + log(theta) + boost::math::lgamma(k) + (1-k)*digamma(k);
 }
 
 } // namespace math
diff --git a/include/boost/math/distributions/geometric.hpp b/include/boost/math/distributions/geometric.hpp
index 7c511ef2d..0a7b383c2 100644
--- a/include/boost/math/distributions/geometric.hpp
+++ b/include/boost/math/distributions/geometric.hpp
@@ -36,6 +36,9 @@
 #ifndef BOOST_MATH_SPECIAL_GEOMETRIC_HPP
 #define BOOST_MATH_SPECIAL_GEOMETRIC_HPP
 
+#include
+#include
+#include
 #include
 #include // for ibeta(a, b, x) == Ix(a, b).
 #include // complement.
@@ -45,10 +48,6 @@
 #include
 #include
 
-#include // using std::numeric_limits;
-#include
-#include
-
 #if defined (BOOST_MSVC)
 #  pragma warning(push)
 // This believed not now necessary, so commented out.
@@ -64,7 +63,7 @@ namespace boost
     {
       // Common error checking routines for geometric distribution function:
       template <class RealType, class Policy>
-      inline bool check_success_fraction(const char* function, const RealType& p, RealType* result, const Policy& pol)
+      BOOST_MATH_GPU_ENABLED inline bool check_success_fraction(const char* function, const RealType& p, RealType* result, const Policy& pol)
       {
         if( !(boost::math::isfinite)(p) || (p < 0) || (p > 1) )
         {
@@ -77,13 +76,13 @@ namespace boost
       }
 
       template <class RealType, class Policy>
-      inline bool check_dist(const char* function, const RealType& p, RealType* result, const Policy& pol)
+      BOOST_MATH_GPU_ENABLED inline bool check_dist(const char* function, const RealType& p, RealType* result, const Policy& pol)
       {
         return check_success_fraction(function, p, result, pol);
       }
 
       template <class RealType, class Policy>
-      inline bool check_dist_and_k(const char* function, const RealType& p, RealType k, RealType* result, const Policy& pol)
+      BOOST_MATH_GPU_ENABLED inline bool check_dist_and_k(const char* function, const RealType& p, RealType k, RealType* result, const Policy& pol)
       {
         if(check_dist(function, p, result, pol) == false)
         {
@@ -100,7 +99,7 @@ namespace boost
       } // Check_dist_and_k
 
       template <class RealType, class Policy>
-      inline bool check_dist_and_prob(const char* function, RealType p, RealType prob, RealType* result, const Policy& pol)
+      BOOST_MATH_GPU_ENABLED inline bool check_dist_and_prob(const char* function, RealType p, RealType prob, RealType* result, const Policy& pol)
       {
         if((check_dist(function, p, result, pol) && detail::check_probability(function, prob, result, pol)) == false)
         {
@@ -117,7 +116,7 @@ namespace boost
       typedef RealType value_type;
       typedef Policy policy_type;
 
-      geometric_distribution(RealType p) : m_p(p)
+      BOOST_MATH_GPU_ENABLED geometric_distribution(RealType p) : m_p(p)
      { // Constructor stores success_fraction p.
        RealType result;
        geometric_detail::check_dist(
@@ -127,22 +126,22 @@ namespace boost
      } // geometric_distribution constructor.
 
      // Private data getter class member functions.
-     RealType success_fraction() const
+     BOOST_MATH_GPU_ENABLED RealType success_fraction() const
      { // Probability of success as fraction in range 0 to 1.
        return m_p;
      }
-     RealType successes() const
+     BOOST_MATH_GPU_ENABLED RealType successes() const
      { // Total number of successes r = 1 (for compatibility with negative binomial?).
        return 1;
      }
 
     // Parameter estimation.
     // (These are copies of negative_binomial distribution with successes = 1).
-     static RealType find_lower_bound_on_p(
+     BOOST_MATH_GPU_ENABLED static RealType find_lower_bound_on_p(
        RealType trials,
        RealType alpha) // alpha 0.05 equivalent to 95% for one-sided test.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_lower_bound_on_p";
+       constexpr auto function = "boost::math::geometric<%1%>::find_lower_bound_on_p";
        RealType result = 0;  // of error checks.
        RealType successes = 1;
        RealType failures = trials - successes;
@@ -163,11 +162,11 @@ namespace boost
        return ibeta_inv(successes, failures + 1, alpha, static_cast<RealType*>(nullptr), Policy());
      } // find_lower_bound_on_p
 
-     static RealType find_upper_bound_on_p(
+     BOOST_MATH_GPU_ENABLED static RealType find_upper_bound_on_p(
        RealType trials,
        RealType alpha) // alpha 0.05 equivalent to 95% for one-sided test.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_upper_bound_on_p";
+       constexpr auto function = "boost::math::geometric<%1%>::find_upper_bound_on_p";
        RealType result = 0;  // of error checks.
        RealType successes = 1;
        RealType failures = trials - successes;
@@ -195,12 +194,12 @@ namespace boost
 
      // Estimate number of trials :
      // "How many trials do I need to be P% sure of seeing k or fewer failures?"
 
-     static RealType find_minimum_number_of_trials(
+     BOOST_MATH_GPU_ENABLED static RealType find_minimum_number_of_trials(
        RealType k,     // number of failures (k >= 0).
        RealType p,     // success fraction 0 <= p <= 1.
        RealType alpha) // risk level threshold 0 <= alpha <= 1.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_minimum_number_of_trials";
+       constexpr auto function = "boost::math::geometric<%1%>::find_minimum_number_of_trials";
        // Error checks:
        RealType result = 0;
        if(false == geometric_detail::check_dist_and_k(
@@ -213,12 +212,12 @@ namespace boost
        return result + k;
      } // RealType find_number_of_failures
 
-     static RealType find_maximum_number_of_trials(
+     BOOST_MATH_GPU_ENABLED static RealType find_maximum_number_of_trials(
        RealType k,     // number of failures (k >= 0).
        RealType p,     // success fraction 0 <= p <= 1.
        RealType alpha) // risk level threshold 0 <= alpha <= 1.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_maximum_number_of_trials";
+       constexpr auto function = "boost::math::geometric<%1%>::find_maximum_number_of_trials";
        // Error checks:
        RealType result = 0;
        if(false == geometric_detail::check_dist_and_k(
@@ -244,22 +243,22 @@ namespace boost
 #endif
 
   template <class RealType, class Policy>
-  inline const std::pair<RealType, RealType> range(const geometric_distribution<RealType, Policy>& /* dist */)
+  BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> range(const geometric_distribution<RealType, Policy>& /* dist */)
   { // Range of permissible values for random variable k.
      using boost::math::tools::max_value;
-     return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
+     return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
   }
 
   template <class RealType, class Policy>
-  inline const std::pair<RealType, RealType> support(const geometric_distribution<RealType, Policy>& /* dist */)
+  BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> support(const geometric_distribution<RealType, Policy>& /* dist */)
   { // Range of supported values for random variable k.
     // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
     using boost::math::tools::max_value;
-    return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
+    return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
   }
 
   template <class RealType, class Policy>
-  inline RealType mean(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType mean(const geometric_distribution<RealType, Policy>& dist)
   { // Mean of geometric distribution = (1-p)/p.
     return (1 - dist.success_fraction() ) / dist.success_fraction();
   } // mean
 
  // median implemented via quantile(half) in derived accessors.
 
   template <class RealType, class Policy>
-  inline RealType mode(const geometric_distribution<RealType, Policy>&)
+  BOOST_MATH_GPU_ENABLED inline RealType mode(const geometric_distribution<RealType, Policy>&)
   { // Mode of geometric distribution = zero.
     BOOST_MATH_STD_USING // ADL of std functions.
     return 0;
   } // mode
 
   template <class RealType, class Policy>
-  inline RealType variance(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType variance(const geometric_distribution<RealType, Policy>& dist)
   { // Variance of Binomial distribution = (1-p) / p^2.
     return (1 - dist.success_fraction()) / (dist.success_fraction() * dist.success_fraction());
   } // variance
 
   template <class RealType, class Policy>
-  inline RealType skewness(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType skewness(const geometric_distribution<RealType, Policy>& dist)
   { // skewness of geometric distribution = 2-p / (sqrt(r(1-p))
     BOOST_MATH_STD_USING // ADL of std functions.
     RealType p = dist.success_fraction();
@@ -289,7 +288,7 @@ namespace boost
   } // skewness
 
   template <class RealType, class Policy>
-  inline RealType kurtosis(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const geometric_distribution<RealType, Policy>& dist)
   { // kurtosis of geometric distribution
     // http://en.wikipedia.org/wiki/geometric is kurtosis_excess so add 3
     RealType p = dist.success_fraction();
@@ -297,7 +296,7 @@ namespace boost
   } // kurtosis
 
   template <class RealType, class Policy>
-  inline RealType kurtosis_excess(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const geometric_distribution<RealType, Policy>& dist)
   { // kurtosis excess of geometric distribution
     // http://mathworld.wolfram.com/Kurtosis.html table of kurtosis_excess
     RealType p = dist.success_fraction();
@@ -312,11 +311,11 @@ namespace boost
   // chf of geometric distribution provided by derived accessors.
 
   template <class RealType, class Policy>
-  inline RealType pdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType pdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
   { // Probability Density/Mass Function.
     BOOST_FPU_EXCEPTION_GUARD
     BOOST_MATH_STD_USING  // For ADL of math functions.
-    static const char* function = "boost::math::pdf(const geometric_distribution<%1%>&, %1%)";
+    constexpr auto function = "boost::math::pdf(const geometric_distribution<%1%>&, %1%)";
     RealType p = dist.success_fraction();
     RealType result = 0;
@@ -350,9 +349,9 @@ namespace boost
   } // geometric_pdf
 
   template <class RealType, class Policy>
-  inline RealType cdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType cdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
   { // Cumulative Distribution Function of geometric.
-    static const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
+    constexpr auto function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
 
     // k argument may be integral, signed, or unsigned, or floating point.
     // If necessary, it has already been promoted from an integral type.
@@ -381,12 +380,10 @@ namespace boost
   } // cdf Cumulative Distribution Function geometric.
 
   template <class RealType, class Policy>
-  inline RealType logcdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType logcdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
   { // Cumulative Distribution Function of geometric.
-    using std::pow;
-    using std::log;
-    using std::exp;
-    static const char* function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
+    BOOST_MATH_STD_USING
+    constexpr auto function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
 
     // k argument may be integral, signed, or unsigned, or floating point.
     // If necessary, it has already been promoted from an integral type.
@@ -399,7 +396,7 @@ namespace boost
          k,
          &result, Policy()))
     {
-      return -std::numeric_limits<RealType>::infinity();
+      return -boost::math::numeric_limits<RealType>::infinity();
     }
 
     if(k == 0)
     {
@@ -413,10 +410,10 @@ namespace boost
   } // logcdf Cumulative Distribution Function geometric.
 
   template <class RealType, class Policy>
-  inline RealType cdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
   { // Complemented Cumulative Distribution Function geometric.
     BOOST_MATH_STD_USING
-    static const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
+    constexpr auto function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
     // k argument may be integral, signed, or unsigned, or floating point.
     // If necessary, it has already been promoted from an integral type.
     RealType const& k = c.param;
@@ -438,10 +435,10 @@ namespace boost
   } // cdf Complemented Cumulative Distribution Function geometric.
 
   template <class RealType, class Policy>
-  inline RealType logcdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType logcdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
   { // Complemented Cumulative Distribution Function geometric.
     BOOST_MATH_STD_USING
-    static const char* function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
+    constexpr auto function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
     // k argument may be integral, signed, or unsigned, or floating point.
     // If necessary, it has already been promoted from an integral type.
     RealType const& k = c.param;
@@ -455,21 +452,21 @@ namespace boost
          k,
          &result, Policy()))
     {
-      return -std::numeric_limits<RealType>::infinity();
+      return -boost::math::numeric_limits<RealType>::infinity();
     }
 
     return boost::math::log1p(-p, Policy()) * (k+1);
   } // logcdf Complemented Cumulative Distribution Function geometric.
 
   template <class RealType, class Policy>
-  inline RealType quantile(const geometric_distribution<RealType, Policy>& dist, const RealType& x)
+  BOOST_MATH_GPU_ENABLED inline RealType quantile(const geometric_distribution<RealType, Policy>& dist, const RealType& x)
   { // Quantile, percentile/100 or Percent Point geometric function.
     // Return the number of expected failures k for a given probability p.
 
     // Inverse cumulative Distribution Function or Quantile (percentile / 100) of geometric Probability.
     // k argument may be integral, signed, or unsigned, or floating point.
-    static const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
+    constexpr auto function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
     BOOST_MATH_STD_USING // ADL of std functions.
 
     RealType success_fraction = dist.success_fraction();
@@ -513,11 +510,11 @@ namespace boost
   } // RealType quantile(const geometric_distribution dist, p)
 
   template <class RealType, class Policy>
-  inline RealType quantile(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
   { // Quantile or Percent Point Binomial function.
     // Return the number of expected failures k for a given
     // complement of the probability Q = 1 - P.
- static const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)"; BOOST_MATH_STD_USING // Error checks: RealType x = c.param; diff --git a/include/boost/math/distributions/inverse_chi_squared.hpp b/include/boost/math/distributions/inverse_chi_squared.hpp index 19dd0371e..1a3c680d2 100644 --- a/include/boost/math/distributions/inverse_chi_squared.hpp +++ b/include/boost/math/distributions/inverse_chi_squared.hpp @@ -1,6 +1,6 @@ // Copyright John Maddock 2010. // Copyright Paul A. Bristow 2010. - +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt @@ -9,6 +9,8 @@ #ifndef BOOST_MATH_DISTRIBUTIONS_INVERSE_CHI_SQUARED_HPP #define BOOST_MATH_DISTRIBUTIONS_INVERSE_CHI_SQUARED_HPP +#include +#include #include #include // for incomplete beta. #include // for complements. @@ -24,14 +26,12 @@ // Weisstein, Eric W. "Inverse Chi-Squared Distribution." From MathWorld--A Wolfram Web Resource. // http://mathworld.wolfram.com/InverseChi-SquaredDistribution.html -#include - namespace boost{ namespace math{ namespace detail { template - inline bool check_inverse_chi_squared( // Check both distribution parameters. + BOOST_MATH_GPU_ENABLED inline bool check_inverse_chi_squared( // Check both distribution parameters. const char* function, RealType degrees_of_freedom, // degrees_of_freedom (aka nu). RealType scale, // scale (aka sigma^2) @@ -51,7 +51,7 @@ public: typedef RealType value_type; typedef Policy policy_type; - inverse_chi_squared_distribution(RealType df, RealType l_scale) : m_df(df), m_scale (l_scale) + BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df, RealType l_scale) : m_df(df), m_scale (l_scale) { RealType result; detail::check_df( @@ -62,7 +62,7 @@ public: m_scale, &result, Policy()); } // inverse_chi_squared_distribution constructor - inverse_chi_squared_distribution(RealType df = 1) : m_df(df) + BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df = 1) : m_df(df) { RealType result; m_scale = 1 / m_df ; // Default scale = 1 / degrees of freedom (Wikipedia definition 1). @@ -71,11 +71,11 @@ public: m_df, &result, Policy()); } // inverse_chi_squared_distribution - RealType degrees_of_freedom()const + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const { return m_df; // aka nu } - RealType scale()const + BOOST_MATH_GPU_ENABLED RealType scale()const { return m_scale; // aka xi } @@ -105,28 +105,28 @@ inverse_chi_squared_distribution(RealType,RealType)->inverse_chi_squared_distrib #endif template -inline const std::pair range(const inverse_chi_squared_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline const boost::math::pair range(const inverse_chi_squared_distribution& /*dist*/) { // Range of permissible values for random variable x. using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); // 0 to + infinity. + return boost::math::pair(static_cast(0), max_value()); // 0 to + infinity. } template -inline const std::pair support(const inverse_chi_squared_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline const boost::math::pair support(const inverse_chi_squared_distribution& /*dist*/) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. 
- return std::pair(static_cast(0), tools::max_value()); // 0 to + infinity. + return boost::math::pair(static_cast(0), tools::max_value()); // 0 to + infinity. } template -RealType pdf(const inverse_chi_squared_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED RealType pdf(const inverse_chi_squared_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); RealType error_result; - static const char* function = "boost::math::pdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::pdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; if(false == detail::check_inverse_chi_squared (function, df, scale, &error_result, Policy()) @@ -159,9 +159,9 @@ RealType pdf(const inverse_chi_squared_distribution& dist, con } // pdf template -inline RealType cdf(const inverse_chi_squared_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const inverse_chi_squared_distribution& dist, const RealType& x) { - static const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); RealType error_result; @@ -188,13 +188,13 @@ inline RealType cdf(const inverse_chi_squared_distribution& di } // cdf template -inline RealType quantile(const inverse_chi_squared_distribution& dist, const RealType& p) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const inverse_chi_squared_distribution& dist, const RealType& p) { using boost::math::gamma_q_inv; RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == detail::check_df( @@ -220,13 +220,13 @@ inline RealType quantile(const inverse_chi_squared_distribution -inline RealType cdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { using boost::math::gamma_q_inv; RealType const& df = c.dist.degrees_of_freedom(); RealType const& scale = c.dist.scale(); RealType const& x = c.param; - static const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == detail::check_df( @@ -251,14 +251,14 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { using boost::math::gamma_q_inv; RealType const& df = c.dist.degrees_of_freedom(); RealType const& scale = c.dist.scale(); RealType const& q = c.param; - static const char* function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == detail::check_df(function, df, &error_result, Policy())) @@ -280,12 +280,12 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const 
inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mean(const inverse_chi_squared_distribution& dist) { // Mean of inverse Chi-Squared distribution. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::mean(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::mean(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 2) return policies::raise_domain_error( function, @@ -295,11 +295,11 @@ inline RealType mean(const inverse_chi_squared_distribution& d } // mean template -inline RealType variance(const inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const inverse_chi_squared_distribution& dist) { // Variance of inverse Chi-Squared distribution. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::variance(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::variance(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 4) { return policies::raise_domain_error( @@ -311,14 +311,14 @@ inline RealType variance(const inverse_chi_squared_distribution -inline RealType mode(const inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const inverse_chi_squared_distribution& dist) { // mode is not defined in Mathematica. // See Discussion section http://en.wikipedia.org/wiki/Talk:Scaled-inverse-chi-square_distribution // for origin of the formula used below. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::mode(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::mode(const inverse_chi_squared_distribution<%1%>&)"; if(df < 0) return policies::raise_domain_error( function, @@ -341,11 +341,11 @@ inline RealType mode(const inverse_chi_squared_distribution& d // Now implemented via quantile(half) in derived accessors. 
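For reference alongside these domain checks (standard results for the scaled inverse chi-squared distribution, not new to this patch): with nu degrees of freedom and scale xi, mean = nu*xi/(nu - 2) for nu > 2, variance = 2*nu^2*xi^2 / ((nu - 2)^2 * (nu - 4)) for nu > 4, mode = nu*xi/(nu + 2), skewness = (4/(nu - 6))*sqrt(2*(nu - 4)) for nu > 6, and excess kurtosis = 12*(5*nu - 22)/((nu - 6)*(nu - 8)) for nu > 8. The df <= 2, 4, 6, and 8 guards in mean(), variance(), skewness(), and kurtosis() reject exactly the parameter ranges where these expressions are undefined.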
template -inline RealType skewness(const inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType skewness(const inverse_chi_squared_distribution& dist) { BOOST_MATH_STD_USING // For ADL RealType df = dist.degrees_of_freedom(); - static const char* function = "boost::math::skewness(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::skewness(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 6) return policies::raise_domain_error( function, @@ -356,10 +356,10 @@ inline RealType skewness(const inverse_chi_squared_distribution -inline RealType kurtosis(const inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const inverse_chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); - static const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 8) return policies::raise_domain_error( function, @@ -370,10 +370,10 @@ inline RealType kurtosis(const inverse_chi_squared_distribution -inline RealType kurtosis_excess(const inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const inverse_chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); - static const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 8) return policies::raise_domain_error( function, diff --git a/include/boost/math/distributions/inverse_gamma.hpp b/include/boost/math/distributions/inverse_gamma.hpp index 8c9e4763d..6aa798ed8 100644 --- a/include/boost/math/distributions/inverse_gamma.hpp +++ b/include/boost/math/distributions/inverse_gamma.hpp @@ -2,6 +2,7 @@ // Copyright Paul A. Bristow 2010. // Copyright John Maddock 2010. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -22,21 +23,21 @@ // http://mathworld.wolfram.com/GammaDistribution.html // http://en.wikipedia.org/wiki/Gamma_distribution +#include +#include +#include #include #include #include #include -#include -#include - namespace boost{ namespace math { namespace detail { template -inline bool check_inverse_gamma_shape( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma_shape( const char* function, // inverse_gamma RealType shape, // shape aka alpha RealType* result, // to update, perhaps with NaN @@ -57,7 +58,7 @@ inline bool check_inverse_gamma_shape( } //bool check_inverse_gamma_shape template -inline bool check_inverse_gamma_x( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -73,7 +74,7 @@ inline bool check_inverse_gamma_x( } template -inline bool check_inverse_gamma( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma( const char* function, // TODO swap these over, so shape is first. 
RealType scale, // scale aka beta RealType shape, // shape aka alpha @@ -92,7 +93,7 @@ public: using value_type = RealType; using policy_type = Policy; - explicit inverse_gamma_distribution(RealType l_shape = 1, RealType l_scale = 1) + BOOST_MATH_GPU_ENABLED explicit inverse_gamma_distribution(RealType l_shape = 1, RealType l_scale = 1) : m_shape(l_shape), m_scale(l_scale) { RealType result; @@ -101,12 +102,12 @@ public: l_scale, l_shape, &result, Policy()); } - RealType shape()const + BOOST_MATH_GPU_ENABLED RealType shape()const { return m_shape; } - RealType scale()const + BOOST_MATH_GPU_ENABLED RealType scale()const { return m_scale; } @@ -132,27 +133,27 @@ inverse_gamma_distribution(RealType,RealType)->inverse_gamma_distribution -inline std::pair range(const inverse_gamma_distribution& /* dist */) +BOOST_MATH_GPU_ENABLED inline boost::math::pair range(const inverse_gamma_distribution& /* dist */) { // Range of permissible values for random variable x. using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline std::pair support(const inverse_gamma_distribution& /* dist */) +BOOST_MATH_GPU_ENABLED inline boost::math::pair support(const inverse_gamma_distribution& /* dist */) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. using boost::math::tools::max_value; using boost::math::tools::min_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline RealType pdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType pdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::pdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -195,17 +196,17 @@ inline RealType pdf(const inverse_gamma_distribution& dist, co } // pdf template -inline RealType logpdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType logpdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions using boost::math::lgamma; - static const char* function = "boost::math::logpdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::logpdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); - RealType result = -std::numeric_limits::infinity(); + RealType result = -boost::math::numeric_limits::infinity(); if(false == detail::check_inverse_gamma(function, scale, shape, &result, Policy())) { // distribution parameters bad. 
return result; @@ -232,11 +233,11 @@ inline RealType logpdf(const inverse_gamma_distribution& dist, } // pdf template -inline RealType cdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::cdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -260,12 +261,12 @@ inline RealType cdf(const inverse_gamma_distribution& dist, co } // cdf template -inline RealType quantile(const inverse_gamma_distribution& dist, const RealType& p) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const inverse_gamma_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions using boost::math::gamma_q_inv; - static const char* function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -287,11 +288,11 @@ inline RealType quantile(const inverse_gamma_distribution& dis } template -inline RealType cdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -310,11 +311,11 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -338,11 +339,11 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mean(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mean(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::mean(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -365,11 +366,11 @@ inline RealType mean(const inverse_gamma_distribution& dist) } // mean template -inline RealType variance(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::variance(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::variance(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -391,11 +392,11 @@ inline RealType variance(const inverse_gamma_distribution& dis } template -inline RealType mode(const inverse_gamma_distribution& 
dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mode(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::mode(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -418,11 +419,11 @@ inline RealType mode(const inverse_gamma_distribution& dist) //} template -inline RealType skewness(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType skewness(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::skewness(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::skewness(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -444,11 +445,11 @@ inline RealType skewness(const inverse_gamma_distribution& dis } template -inline RealType kurtosis_excess(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::kurtosis_excess(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis_excess(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -470,9 +471,9 @@ inline RealType kurtosis_excess(const inverse_gamma_distribution -inline RealType kurtosis(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const inverse_gamma_distribution& dist) { - static const char* function = "boost::math::kurtosis(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); diff --git a/include/boost/math/tools/tuple.hpp b/include/boost/math/tools/tuple.hpp index 82d23b8d7..dcc763e37 100644 --- a/include/boost/math/tools/tuple.hpp +++ b/include/boost/math/tools/tuple.hpp @@ -23,6 +23,7 @@ using cuda::std::tuple; using cuda::std::make_pair; +using cuda::std::tie; using cuda::std::get; using cuda::std::tuple_size; diff --git a/test/cuda_jamfile b/test/cuda_jamfile index c9a70e8a9..bae42a8d9 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -64,11 +64,46 @@ run test_extreme_value_pdf_float.cu ; run test_extreme_value_quan_double.cu ; run test_extreme_value_quan_float.cu ; +run test_fisher_f_cdf_double.cu ; +run test_fisher_f_cdf_float.cu ; +run test_fisher_f_pdf_double.cu ; +run test_fisher_f_pdf_float.cu ; +run test_fisher_f_quan_double.cu ; +run test_fisher_f_quan_float.cu ; + +run test_gamma_dist_cdf_double.cu ; +run test_gamma_dist_cdf_float.cu ; +run test_gamma_dist_pdf_double.cu ; +run test_gamma_dist_pdf_float.cu ; +run test_gamma_dist_quan_double.cu ; +run test_gamma_dist_quan_float.cu ; + +run test_geometric_dist_cdf_double.cu ; +run test_geometric_dist_cdf_float.cu ; +run test_geometric_dist_pdf_double.cu ; +run test_geometric_dist_pdf_float.cu ; +run test_geometric_dist_quan_double.cu ; +run test_geometric_dist_quan_float.cu ; + run test_holtsmark_cdf_double.cu ; run test_holtsmark_cdf_float.cu ; run test_holtsmark_pdf_double.cu ; run test_holtsmark_pdf_float.cu ; +run test_inverse_chi_squared_cdf_double.cu ; +run test_inverse_chi_squared_cdf_float.cu ; +run 
test_inverse_chi_squared_pdf_double.cu ; +run test_inverse_chi_squared_pdf_float.cu ; +run test_inverse_chi_squared_quan_double.cu ; +run test_inverse_chi_squared_quan_float.cu ; + +run test_inverse_gamma_cdf_double.cu ; +run test_inverse_gamma_cdf_float.cu ; +run test_inverse_gamma_pdf_double.cu ; +run test_inverse_gamma_pdf_float.cu ; +run test_inverse_gamma_quan_double.cu ; +run test_inverse_gamma_quan_float.cu ; + run test_landau_cdf_double.cu ; run test_landau_cdf_float.cu ; run test_landau_pdf_double.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 1fc2746a1..1b001eb2f 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -59,6 +59,27 @@ run test_extreme_value_pdf_nvrtc_float.cpp ; run test_extreme_value_quan_nvrtc_double.cpp ; run test_extreme_value_quan_nvrtc_float.cpp ; +run test_fisher_f_cdf_nvrtc_double.cpp ; +run test_fisher_f_cdf_nvrtc_float.cpp ; +run test_fisher_f_pdf_nvrtc_double.cpp ; +run test_fisher_f_pdf_nvrtc_float.cpp ; +run test_fisher_f_quan_nvrtc_double.cpp ; +run test_fisher_f_quan_nvrtc_float.cpp ; + +run test_gamma_dist_cdf_nvrtc_double.cpp ; +run test_gamma_dist_cdf_nvrtc_float.cpp ; +run test_gamma_dist_pdf_nvrtc_double.cpp ; +run test_gamma_dist_pdf_nvrtc_float.cpp ; +run test_gamma_dist_quan_nvrtc_double.cpp ; +run test_gamma_dist_quan_nvrtc_float.cpp ; + +run test_geometric_dist_cdf_nvrtc_double.cpp ; +run test_geometric_dist_cdf_nvrtc_float.cpp ; +run test_geometric_dist_pdf_nvrtc_double.cpp ; +run test_geometric_dist_pdf_nvrtc_float.cpp ; +run test_geometric_dist_quan_nvrtc_double.cpp ; +run test_geometric_dist_quan_nvrtc_float.cpp ; + run test_holtsmark_cdf_nvrtc_double.cpp ; run test_holtsmark_cdf_nvrtc_float.cpp ; run test_holtsmark_pdf_nvrtc_double.cpp ; @@ -66,6 +87,20 @@ run test_holtsmark_pdf_nvrtc_float.cpp ; run test_holtsmark_quan_nvrtc_double.cpp ; run test_holtsmark_quan_nvrtc_float.cpp ; +run test_inverse_chi_squared_cdf_nvrtc_double.cpp ; +run test_inverse_chi_squared_cdf_nvrtc_float.cpp ; +run test_inverse_chi_squared_pdf_nvrtc_double.cpp ; +run test_inverse_chi_squared_pdf_nvrtc_float.cpp ; +run test_inverse_chi_squared_quan_nvrtc_double.cpp ; +run test_inverse_chi_squared_quan_nvrtc_float.cpp ; + +run test_inverse_gamma_cdf_nvrtc_double.cpp ; +run test_inverse_gamma_cdf_nvrtc_float.cpp ; +run test_inverse_gamma_pdf_nvrtc_double.cpp ; +run test_inverse_gamma_pdf_nvrtc_float.cpp ; +run test_inverse_gamma_quan_nvrtc_double.cpp ; +run test_inverse_gamma_quan_nvrtc_float.cpp ; + run test_landau_cdf_nvrtc_double.cpp ; run test_landau_cdf_nvrtc_float.cpp ; run test_landau_pdf_nvrtc_double.cpp ; diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 5d3d85cd8..bcdfb7e5d 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -17,7 +17,12 @@ run test_cauchy.cpp ; run test_chi_squared.cpp ; run test_exponential_dist.cpp ; run test_extreme_value.cpp ; +run test_fisher_f.cpp ; +run test_gamma_dist.cpp ; +run test_geometric.cpp ; run test_holtsmark.cpp ; +run test_inverse_chi_squared_distribution.cpp ; +run test_inverse_gamma_distribution.cpp ; run test_landau.cpp ; run test_laplace.cpp ; run test_logistic_dist.cpp ; diff --git a/test/test_arcsine_cdf_double.cu b/test/test_arcsine_cdf_double.cu index d6f6f7b35..3ac9e22cd 100644 --- a/test/test_arcsine_cdf_double.cu +++ b/test/test_arcsine_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA 
kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_cdf_float.cu b/test/test_arcsine_cdf_float.cu index 148b1dffb..cc73ce95b 100644 --- a/test/test_arcsine_cdf_float.cu +++ b/test/test_arcsine_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_pdf_double.cu b/test/test_arcsine_pdf_double.cu index 7a73bb34e..8f45017ba 100644 --- a/test/test_arcsine_pdf_double.cu +++ b/test/test_arcsine_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_pdf_float.cu b/test/test_arcsine_pdf_float.cu index 54a11253c..c236b7876 100644 --- a/test/test_arcsine_pdf_float.cu +++ b/test/test_arcsine_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_quan_double.cu b/test/test_arcsine_quan_double.cu index 31f6eac8a..a45737063 100644 --- a/test/test_arcsine_quan_double.cu +++ b/test/test_arcsine_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_quan_float.cu b/test/test_arcsine_quan_float.cu index 6decb347b..fd8cd11fc 100644 --- a/test/test_arcsine_quan_float.cu +++ b/test/test_arcsine_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_range_support_double.cu b/test/test_arcsine_range_support_double.cu index cec919a1a..b3fb575fa 100644 --- a/test/test_arcsine_range_support_double.cu +++ b/test/test_arcsine_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_range_support_float.cu b/test/test_arcsine_range_support_float.cu index d397e0c86..d207d0598 100644 --- a/test/test_arcsine_range_support_float.cu +++ b/test/test_arcsine_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int 
threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_cdf_double.cu b/test/test_bernoulli_cdf_double.cu index e4c21ca06..1a6dce645 100644 --- a/test/test_bernoulli_cdf_double.cu +++ b/test/test_bernoulli_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_cdf_float.cu b/test/test_bernoulli_cdf_float.cu index 82c0eabc0..998f24736 100644 --- a/test/test_bernoulli_cdf_float.cu +++ b/test/test_bernoulli_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_pdf_double.cu b/test/test_bernoulli_pdf_double.cu index 24b33c16c..147e2f340 100644 --- a/test/test_bernoulli_pdf_double.cu +++ b/test/test_bernoulli_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_pdf_float.cu b/test/test_bernoulli_pdf_float.cu index 08d2ca5a0..49eaea32f 100644 --- a/test/test_bernoulli_pdf_float.cu +++ b/test/test_bernoulli_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_range_support_double.cu b/test/test_bernoulli_range_support_double.cu index 86c77bd11..ade952fca 100644 --- a/test/test_bernoulli_range_support_double.cu +++ b/test/test_bernoulli_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_range_support_float.cu b/test/test_bernoulli_range_support_float.cu index cdcf54418..ef276b938 100644 --- a/test/test_bernoulli_range_support_float.cu +++ b/test/test_bernoulli_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_cdf_double.cu b/test/test_beta_dist_cdf_double.cu index fa460244a..9188f4305 100644 --- 
a/test/test_beta_dist_cdf_double.cu +++ b/test/test_beta_dist_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_cdf_float.cu b/test/test_beta_dist_cdf_float.cu index 321c84420..0278f6415 100644 --- a/test/test_beta_dist_cdf_float.cu +++ b/test/test_beta_dist_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_pdf_double.cu b/test/test_beta_dist_pdf_double.cu index c0ee9272a..e86cf94dd 100644 --- a/test/test_beta_dist_pdf_double.cu +++ b/test/test_beta_dist_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_pdf_float.cu b/test/test_beta_dist_pdf_float.cu index 75e4fa27b..97dd606f2 100644 --- a/test/test_beta_dist_pdf_float.cu +++ b/test/test_beta_dist_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_quan_double.cu b/test/test_beta_dist_quan_double.cu index 101526afa..a6b842e8e 100644 --- a/test/test_beta_dist_quan_double.cu +++ b/test/test_beta_dist_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_quan_float.cu b/test/test_beta_dist_quan_float.cu index 77696c639..48a860f4c 100644 --- a/test/test_beta_dist_quan_float.cu +++ b/test/test_beta_dist_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_cdf_double.cu b/test/test_cauchy_cdf_double.cu index dc99cbe33..526744ba1 100644 --- a/test/test_cauchy_cdf_double.cu +++ b/test/test_cauchy_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git 
a/test/test_cauchy_cdf_float.cu b/test/test_cauchy_cdf_float.cu index dc99cbe33..526744ba1 100644 --- a/test/test_cauchy_cdf_float.cu +++ b/test/test_cauchy_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_pdf_double.cu b/test/test_cauchy_pdf_double.cu index 7a7fe5ba6..62398c31e 100644 --- a/test/test_cauchy_pdf_double.cu +++ b/test/test_cauchy_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_pdf_float.cu b/test/test_cauchy_pdf_float.cu index 5ec3b604b..aff3369b8 100644 --- a/test/test_cauchy_pdf_float.cu +++ b/test/test_cauchy_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_quan_double.cu b/test/test_cauchy_quan_double.cu index 21f4b4dda..0fcaaafe7 100644 --- a/test/test_cauchy_quan_double.cu +++ b/test/test_cauchy_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_quan_float.cu b/test/test_cauchy_quan_float.cu index b6bed1520..9c04c5b12 100644 --- a/test/test_cauchy_quan_float.cu +++ b/test/test_cauchy_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_range_support_double.cu b/test/test_cauchy_range_support_double.cu index 4ec792ce3..3a42c1bd3 100644 --- a/test/test_cauchy_range_support_double.cu +++ b/test/test_cauchy_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_range_support_float.cu b/test/test_cauchy_range_support_float.cu index 1cdd90e40..e713736e6 100644 --- a/test/test_cauchy_range_support_float.cu +++ b/test/test_cauchy_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << 
blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_cdf_double.cu b/test/test_chi_squared_cdf_double.cu index 1b0c34ce6..c2475883b 100644 --- a/test/test_chi_squared_cdf_double.cu +++ b/test/test_chi_squared_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_cdf_float.cu b/test/test_chi_squared_cdf_float.cu index 8ca99ed2e..07dce0d06 100644 --- a/test/test_chi_squared_cdf_float.cu +++ b/test/test_chi_squared_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_pdf_double.cu b/test/test_chi_squared_pdf_double.cu index ed45246d3..30edafd05 100644 --- a/test/test_chi_squared_pdf_double.cu +++ b/test/test_chi_squared_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_pdf_float.cu b/test/test_chi_squared_pdf_float.cu index 5a0f97db9..9b205182b 100644 --- a/test/test_chi_squared_pdf_float.cu +++ b/test/test_chi_squared_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_quan_double.cu b/test/test_chi_squared_quan_double.cu index 3b7dad972..3fae7d966 100644 --- a/test/test_chi_squared_quan_double.cu +++ b/test/test_chi_squared_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_quan_float.cu b/test/test_chi_squared_quan_float.cu index 3e779a090..7a717530e 100644 --- a/test/test_chi_squared_quan_float.cu +++ b/test/test_chi_squared_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_cdf_double.cu b/test/test_exponential_cdf_double.cu index 8601d1c08..e3a57e86e 100644 --- a/test/test_exponential_cdf_double.cu +++ b/test/test_exponential_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 
512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_cdf_float.cu b/test/test_exponential_cdf_float.cu index aa5ef9153..ed214a495 100644 --- a/test/test_exponential_cdf_float.cu +++ b/test/test_exponential_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_pdf_double.cu b/test/test_exponential_pdf_double.cu index 9a5615f1b..530b1023b 100644 --- a/test/test_exponential_pdf_double.cu +++ b/test/test_exponential_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_pdf_float.cu b/test/test_exponential_pdf_float.cu index f15ee3ea8..0801e2d0b 100644 --- a/test/test_exponential_pdf_float.cu +++ b/test/test_exponential_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_quan_double.cu b/test/test_exponential_quan_double.cu index ea5a5a681..f4eb4c3b1 100644 --- a/test/test_exponential_quan_double.cu +++ b/test/test_exponential_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_quan_float.cu b/test/test_exponential_quan_float.cu index ea5a5a681..f4eb4c3b1 100644 --- a/test/test_exponential_quan_float.cu +++ b/test/test_exponential_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_range_support_double.cu b/test/test_exponential_range_support_double.cu index eec3981d2..c19497ed5 100644 --- a/test/test_exponential_range_support_double.cu +++ b/test/test_exponential_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_range_support_float.cu b/test/test_exponential_range_support_float.cu 
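(A sketch of the launch configuration repeated in every one of these hunks, not itself part of the diff:) each CUDA runner halves the per-block thread count from 512 to 256 and derives the grid size by ceiling division, which guarantees blocksPerGrid * threadsPerBlock >= numElements for any block size; the `if (i < numElements)` guard inside each kernel then discards the overshoot threads:

    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    // hypothetical kernel name; each test launches its own kernel this way
    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input, output, numElements);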
index 00f443e52..a111090de 100644 --- a/test/test_exponential_range_support_float.cu +++ b/test/test_exponential_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_cdf_double.cu b/test/test_extreme_value_cdf_double.cu index 8f7f366b3..7ca000348 100644 --- a/test/test_extreme_value_cdf_double.cu +++ b/test/test_extreme_value_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_cdf_float.cu b/test/test_extreme_value_cdf_float.cu index d1b6cc762..bc3ead6eb 100644 --- a/test/test_extreme_value_cdf_float.cu +++ b/test/test_extreme_value_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_pdf_double.cu b/test/test_extreme_value_pdf_double.cu index 4cf3fc2d0..44ccc5b71 100644 --- a/test/test_extreme_value_pdf_double.cu +++ b/test/test_extreme_value_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_pdf_float.cu b/test/test_extreme_value_pdf_float.cu index c0c5da7ee..390622f40 100644 --- a/test/test_extreme_value_pdf_float.cu +++ b/test/test_extreme_value_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_quan_double.cu b/test/test_extreme_value_quan_double.cu index 703d2054f..41f2f69a6 100644 --- a/test/test_extreme_value_quan_double.cu +++ b/test/test_extreme_value_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_quan_float.cu b/test/test_extreme_value_quan_float.cu index 25d982cd0..5fe16e9a8 100644 --- a/test/test_extreme_value_quan_float.cu +++ b/test/test_extreme_value_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / 
threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f.cpp b/test/test_fisher_f.cpp index c18ed8ff1..f142a3327 100644 --- a/test/test_fisher_f.cpp +++ b/test/test_fisher_f.cpp @@ -8,9 +8,13 @@ // (See accompanying file LICENSE_1_0.txt // or copy at http://www.boost.org/LICENSE_1_0.txt) -#include +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept using ::boost::math::concepts::real_concept; +#endif #include // for fisher_f_distribution using boost::math::fisher_f_distribution; diff --git a/test/test_fisher_f_cdf_double.cu b/test/test_fisher_f_cdf_double.cu new file mode 100644 index 000000000..c6d6f0a94 --- /dev/null +++ b/test/test_fisher_f_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_cdf_float.cu b/test/test_fisher_f_cdf_float.cu new file mode 100644 index 000000000..9df1bc869 --- /dev/null +++ b/test/test_fisher_f_cdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_cdf_nvrtc_double.cpp b/test/test_fisher_f_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..1eb9cb00f --- /dev/null +++ b/test/test_fisher_f_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + 
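    // (Sketch, not in the patch:) nvrtcGetPTXSize and nvrtcGetPTX also return an
    // nvrtcResult, so these two calls could be routed through the checkNVRTCError
    // helper defined above rather than being left unchecked, e.g.:
    //   checkNVRTCError(nvrtcGetPTXSize(prog, &ptx_size), "Failed to get PTX size");
    //   checkNVRTCError(nvrtcGetPTX(prog, ptx), "Failed to get PTX");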
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::fisher_f_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_fisher_f_cdf_nvrtc_float.cpp b/test/test_fisher_f_cdf_nvrtc_float.cpp
new file mode 100644
index 000000000..244190cf1
--- /dev/null
+++ b/test/test_fisher_f_cdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/fisher_f.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <vector>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/distributions/fisher_f.hpp>
+#include
+extern "C" __global__
+void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::fisher_f_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_fisher_f_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::fisher_f_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_fisher_f_pdf_double.cu b/test/test_fisher_f_pdf_double.cu
new file mode 100644
index 000000000..77a3b655a
--- /dev/null
+++ b/test/test_fisher_f_pdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/fisher_f.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::fisher_f_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::fisher_f_distribution<float_type>(1, 1), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/test/test_fisher_f_pdf_float.cu b/test/test_fisher_f_pdf_float.cu
new file mode 100644
index 000000000..323edf342
--- /dev/null
+++ b/test/test_fisher_f_pdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/fisher_f.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::fisher_f_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::fisher_f_distribution<float_type>(1, 1), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/test/test_fisher_f_pdf_nvrtc_double.cpp b/test/test_fisher_f_pdf_nvrtc_double.cpp
new file mode 100644
index 000000000..8aa1482aa
--- /dev/null
+++ b/test/test_fisher_f_pdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_pdf_nvrtc_float.cpp b/test/test_fisher_f_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..e461dea9a --- /dev/null +++ b/test/test_fisher_f_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_quan_double.cu b/test/test_fisher_f_quan_double.cu new file mode 100644 index 000000000..c16eb2a95 --- /dev/null +++ b/test/test_fisher_f_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_quan_float.cu b/test/test_fisher_f_quan_float.cu new file mode 100644 index 000000000..85cf47967 --- /dev/null +++ b/test/test_fisher_f_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_quan_nvrtc_double.cpp b/test/test_fisher_f_quan_nvrtc_double.cpp new file mode 100644 index 000000000..16ad0cbc0 --- /dev/null +++ b/test/test_fisher_f_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_quan_nvrtc_float.cpp b/test/test_fisher_f_quan_nvrtc_float.cpp new file mode 100644 index 000000000..377048e52 --- /dev/null +++ b/test/test_fisher_f_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_dist.cpp b/test/test_gamma_dist.cpp index b7776c79c..2b1a181f3 100644 --- a/test/test_gamma_dist.cpp +++ b/test/test_gamma_dist.cpp @@ -15,16 +15,23 @@ // From MathWorld--A Wolfram Web Resource. // http://mathworld.wolfram.com/GammaDistribution.html +#ifndef SYCL_LANGUAGE_VERSION #include // include directory libs/math/src/tr1/ is needed. +#endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept +#endif + #define BOOST_TEST_MAIN #include // Boost.Test #include #include using boost::math::gamma_distribution; -#include +#include "../include_private/boost/math/tools/test.hpp" #include "test_out_of_range.hpp" #include diff --git a/test/test_gamma_dist_cdf_double.cu b/test/test_gamma_dist_cdf_double.cu new file mode 100644 index 000000000..4777196aa --- /dev/null +++ b/test/test_gamma_dist_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_cdf_float.cu b/test/test_gamma_dist_cdf_float.cu new file mode 100644 index 000000000..a93aca395 --- /dev/null +++ b/test/test_gamma_dist_cdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_cdf_nvrtc_double.cpp b/test/test_gamma_dist_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..3e911f4e0 --- /dev/null +++ b/test/test_gamma_dist_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] 
= static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::gamma_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_dist_cdf_nvrtc_float.cpp b/test/test_gamma_dist_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..17762d406 --- /dev/null +++ b/test/test_gamma_dist_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::gamma_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_dist_pdf_double.cu b/test/test_gamma_dist_pdf_double.cu new file mode 100644 index 000000000..a8411d5b6 --- /dev/null +++ b/test/test_gamma_dist_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::gamma_distribution<float_type>(1, 1), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_gamma_dist_pdf_float.cu b/test/test_gamma_dist_pdf_float.cu
new file mode 100644
index 000000000..6ab3247ac
--- /dev/null
+++ b/test/test_gamma_dist_pdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_pdf_nvrtc_double.cpp b/test/test_gamma_dist_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..1faae9986 --- /dev/null +++ b/test/test_gamma_dist_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_gamma_dist_pdf_nvrtc_float.cpp b/test/test_gamma_dist_pdf_nvrtc_float.cpp
new file mode 100644
index 000000000..054ddbbad
--- /dev/null
+++ b/test/test_gamma_dist_pdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::gamma_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_dist_quan_double.cu b/test/test_gamma_dist_quan_double.cu new file mode 100644 index 000000000..d29bf6d6b --- /dev/null +++ b/test/test_gamma_dist_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_quan_float.cu b/test/test_gamma_dist_quan_float.cu new file mode 100644 index 000000000..58aa42e90 --- /dev/null +++ b/test/test_gamma_dist_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_quan_nvrtc_double.cpp b/test/test_gamma_dist_quan_nvrtc_double.cpp new file mode 100644 index 000000000..132efcd6c --- /dev/null +++ b/test/test_gamma_dist_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::gamma_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_dist_quan_nvrtc_float.cpp b/test/test_gamma_dist_quan_nvrtc_float.cpp new file mode 100644 index 000000000..7749523ab --- /dev/null +++ b/test/test_gamma_dist_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_geometric.cpp b/test/test_geometric.cpp
index 928a2aa0e..13a9e090b 100644
--- a/test/test_geometric.cpp
+++ b/test/test_geometric.cpp
@@ -26,9 +26,14 @@
 # define TEST_REAL_CONCEPT
 #endif
 
-#include <boost/math/tools/test.hpp>
+#include <boost/math/tools/config.hpp>
+
+#include "../include_private/boost/math/tools/test.hpp"
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp> // for real_concept
 using ::boost::math::concepts::real_concept;
+#endif
 
 #include <boost/math/distributions/geometric.hpp> // for geometric_distribution
 using boost::math::geometric_distribution;
@@ -64,7 +69,11 @@ void test_spot( // Test a single spot value against 'known good' values.
   RealType tol,     // Test tolerance
   RealType logtol)  // Logcdf Test tolerance.
 {
-  BOOST_IF_CONSTEXPR (std::is_same<RealType, long double>::value || std::is_same<RealType, real_concept>::value)
+  BOOST_IF_CONSTEXPR (std::is_same<RealType, long double>::value
+  #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
+  || std::is_same<RealType, real_concept>::value
+  #endif
+  )
   {
     logtol *= 100;
   }
@@ -376,7 +385,9 @@ if(std::numeric_limits<RealType>::is_specialized)
     static_cast<RealType>(9.9000000000003448e-201L), //
     100 * tolerance); // Note difference
-  // p nearer unity.
+  // p nearer unity.
+  // On GPU this gets flushed to 0 which has an eps difference of 3.4e+38
+  #ifndef BOOST_MATH_HAS_GPU_SUPPORT
   BOOST_CHECK_CLOSE_FRACTION( //
     pdf(geometric_distribution<RealType>(static_cast<RealType>(0.9999)), static_cast<RealType>(10) ), // Number of failures, k
     static_cast<RealType>(9.999e-41), // exact from 100 digit calculator.
     2e3 * tolerance); // Note bigger tolerance needed.
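The guard above exists because epsilon_difference() saturates once a denormal-range result is flushed to zero: the reported distance becomes roughly the largest representable float, so no finite tolerance can pass. A minimal host-side sketch of that failure mode, assuming only the real boost::math::epsilon_difference API and the 9.999e-41 value quoted above (the program itself is illustrative and not part of this patch):

#include <boost/math/special_functions/relative_difference.hpp>
#include <iostream>

int main()
{
    // Serial (host) result of the near-unity spot check, in the denormal range.
    const float exact = 9.999e-41f;
    // The same value after a GPU flush-to-zero.
    const float flushed = 0.0f;

    // Prints the ULP-style distance used by the verification loops in this
    // patch; per the comment above, it is on the order of 3.4e+38 eps, which
    // is why the check is compiled out under BOOST_MATH_HAS_GPU_SUPPORT.
    std::cout << boost::math::epsilon_difference(exact, flushed) << " eps\n";

    return 0;
}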
+ #endif // Moshier Cephes 100 digits calculator says 9.999e-41 //0.9999*pow(1-0.9999,10) diff --git a/test/test_geometric_dist_cdf_double.cu b/test/test_geometric_dist_cdf_double.cu new file mode 100644 index 000000000..98b6510ad --- /dev/null +++ b/test/test_geometric_dist_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::geometric_distribution(0.5), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_geometric_dist_cdf_float.cu b/test/test_geometric_dist_cdf_float.cu new file mode 100644 index 000000000..2662ac07c --- /dev/null +++ b/test/test_geometric_dist_cdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::geometric_distribution(0.5), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_geometric_dist_cdf_nvrtc_double.cpp b/test/test_geometric_dist_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..f8c5ed5aa --- /dev/null +++ b/test/test_geometric_dist_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX 
from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_cdf_nvrtc_float.cpp b/test/test_geometric_dist_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..a53cd0d97 --- /dev/null +++ b/test/test_geometric_dist_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/geometric.hpp>
+extern "C" __global__
+void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_geometric_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::geometric_distribution<float_type>(0.5), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_geometric_dist_pdf_double.cu b/test/test_geometric_dist_pdf_double.cu
new file mode 100644
index 000000000..03d2dc007
--- /dev/null
+++ b/test/test_geometric_dist_pdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_pdf_float.cu b/test/test_geometric_dist_pdf_float.cu
new file mode 100644
index 000000000..1034d122b
--- /dev/null
+++ b/test/test_geometric_dist_pdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_pdf_nvrtc_double.cpp b/test/test_geometric_dist_pdf_nvrtc_double.cpp
new file mode 100644
index 000000000..8a6b5756e
--- /dev/null
+++ b/test/test_geometric_dist_pdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/geometric.hpp>
+extern "C" __global__
+void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_geometric_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::geometric_distribution<float_type>(0.5), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_geometric_dist_pdf_nvrtc_float.cpp b/test/test_geometric_dist_pdf_nvrtc_float.cpp
new file mode 100644
index 000000000..dfb05105d
--- /dev/null
+++ b/test/test_geometric_dist_pdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/geometric.hpp>
+extern "C" __global__
+void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_geometric_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::geometric_distribution<float_type>(0.5), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_geometric_dist_quan_double.cu b/test/test_geometric_dist_quan_double.cu
new file mode 100644
index 000000000..fcac938e5
--- /dev/null
+++ b/test/test_geometric_dist_quan_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_quan_float.cu b/test/test_geometric_dist_quan_float.cu
new file mode 100644
index 000000000..074952202
--- /dev/null
+++ b/test/test_geometric_dist_quan_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_quan_nvrtc_double.cpp b/test/test_geometric_dist_quan_nvrtc_double.cpp
new file mode 100644
index 000000000..52b2e97ec
--- /dev/null
+++ b/test/test_geometric_dist_quan_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/geometric.hpp>
+extern "C" __global__
+void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_geometric_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::geometric_distribution<float_type>(0.5), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_geometric_dist_quan_nvrtc_float.cpp b/test/test_geometric_dist_quan_nvrtc_float.cpp
new file mode 100644
index 000000000..a83cf857e
--- /dev/null
+++ b/test/test_geometric_dist_quan_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/geometric.hpp>
+extern "C" __global__
+void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_geometric_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::geometric_distribution<float_type>(0.5), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_holtsmark_cdf_double.cu b/test/test_holtsmark_cdf_double.cu
index 5a02b7ddb..6b1d57041 100644
--- a/test/test_holtsmark_cdf_double.cu
+++ b/test/test_holtsmark_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_holtsmark_cdf_float.cu b/test/test_holtsmark_cdf_float.cu
index 71ae21f73..2a3533bac 100644
--- a/test/test_holtsmark_cdf_float.cu
+++ b/test/test_holtsmark_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_holtsmark_pdf_double.cu b/test/test_holtsmark_pdf_double.cu
index b883515a7..a53360d20 100644
--- a/test/test_holtsmark_pdf_double.cu
+++ b/test/test_holtsmark_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_holtsmark_pdf_float.cu b/test/test_holtsmark_pdf_float.cu
index c56815973..57052803f 100644
--- a/test/test_holtsmark_pdf_float.cu
+++ b/test/test_holtsmark_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_inverse_chi_squared_cdf_double.cu b/test/test_inverse_chi_squared_cdf_double.cu
new file mode 100644
index 000000000..9703e7a3a
--- /dev/null
+++ b/test/test_inverse_chi_squared_cdf_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_cdf_float.cu b/test/test_inverse_chi_squared_cdf_float.cu
new file mode 100644
index 000000000..bb56a4872
--- /dev/null
+++ b/test/test_inverse_chi_squared_cdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp b/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp
new file mode 100644
index 000000000..b221aedaa
--- /dev/null
+++ b/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+extern "C" __global__
+void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp b/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp
new file mode 100644
index 000000000..743654c14
--- /dev/null
+++ b/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_distribution.cpp b/test/test_inverse_chi_squared_distribution.cpp index a69782418..cbc9dcf19 100644 --- a/test/test_inverse_chi_squared_distribution.cpp +++ b/test/test_inverse_chi_squared_distribution.cpp @@ -14,11 +14,14 @@ // http://www.wolframalpha.com/input/?i=inverse+chisquare+distribution -#include +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept using ::boost::math::concepts::real_concept; +#endif -//#include #define BOOST_TEST_MAIN #include // for test_main #include // for BOOST_CHECK_CLOSE_FRACTION diff --git a/test/test_inverse_chi_squared_pdf_double.cu b/test/test_inverse_chi_squared_pdf_double.cu new file mode 100644 index 000000000..f30611749 --- /dev/null +++ b/test/test_inverse_chi_squared_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
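
Both the runtime-API (.cu) and NVRTC harnesses size their launches with the same ceiling-division idiom, which is why every kernel guards with `if (i < numElements)`: the last block may be only partially populated. A standalone sketch of the arithmetic:

#include <iostream>

int main()
{
    const int blockSize   = 256;
    const int numElements = 50000;
    // Round up so that blockSize * numBlocks >= numElements always holds.
    const int numBlocks = (numElements + blockSize - 1) / blockSize;
    std::cout << numBlocks << " blocks of " << blockSize
              << " threads cover " << numBlocks * blockSize << " slots\n";
}
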
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_pdf_float.cu b/test/test_inverse_chi_squared_pdf_float.cu
new file mode 100644
index 000000000..8a3d1c1ef
--- /dev/null
+++ b/test/test_inverse_chi_squared_pdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::inverse_chi_squared_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} diff --git a/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp b/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..4608b3bd6 --- /dev/null +++ b/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
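
The .cu tests lean on cuda_managed_ptr from the repository's test helpers. That header is not part of this patch; assuming it wraps cudaMallocManaged (unified memory readable from both host and device, which matches how .get() and operator[] are used above), a hypothetical sketch of such a helper could look like this. The class name and interface here are illustrative, not the repository's actual header:

#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>

template <class T>
class managed_buffer // hypothetical stand-in for cuda_managed_ptr<T>
{
    T* p_ = nullptr;
public:
    explicit managed_buffer(std::size_t n)
    {
        // One allocation visible to both host and device code.
        if (cudaMallocManaged(&p_, n * sizeof(T)) != cudaSuccess)
            throw std::runtime_error("cudaMallocManaged failed");
    }
    ~managed_buffer() { cudaFree(p_); }
    managed_buffer(const managed_buffer&) = delete;
    managed_buffer& operator=(const managed_buffer&) = delete;
    T* get() const { return p_; }
    T& operator[](std::size_t i) const { return p_[i]; }
};
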
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::inverse_chi_squared_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp b/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp
new file mode 100644
index 000000000..8b4db55c0
--- /dev/null
+++ b/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_quan_double.cu b/test/test_inverse_chi_squared_quan_double.cu new file mode 100644 index 000000000..f9022c6a3 --- /dev/null +++ b/test/test_inverse_chi_squared_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
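
Since the quantile is the inverse CDF, the quan tests' uniform draws on [0, 1) are valid probability inputs, and a host-side round trip makes a quick sanity check for the serial reference itself. A sketch using the same default-constructed distribution as the tests:

#include <boost/math/distributions/inverse_chi_squared.hpp>
#include <iostream>

int main()
{
    boost::math::inverse_chi_squared_distribution<double> dist; // defaults, as in the tests
    const double p = 0.25;
    const double x = quantile(dist, p);      // inverse CDF
    std::cout << "p = " << p
              << ", cdf(quantile(p)) = " << cdf(dist, x) << std::endl; // ~0.25
}
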
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::inverse_chi_squared_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} diff --git a/test/test_inverse_chi_squared_quan_float.cu b/test/test_inverse_chi_squared_quan_float.cu new file mode 100644 index 000000000..10aa6d707 --- /dev/null +++ b/test/test_inverse_chi_squared_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::inverse_chi_squared_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} diff --git a/test/test_inverse_chi_squared_quan_nvrtc_double.cpp b/test/test_inverse_chi_squared_quan_nvrtc_double.cpp new file mode 100644 index 000000000..0f8a9a5f8 --- /dev/null +++ b/test/test_inverse_chi_squared_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
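
A point worth spelling out in the driver-API launches above: cuLaunchKernel receives the kernel arguments as an array of pointers to each argument, in declaration order, so scalars such as numElements are passed by address too. A sketch of just that convention, factored into a helper (the function name is illustrative):

#include <cuda.h>

// Launch a kernel of signature (const double*, const double*, double*, int)
// via the driver API; each args[] entry is the address of one argument.
CUresult launch_test_kernel(CUfunction kernel, double* d_in1, double* d_in2,
                            double* d_out, int numElements)
{
    const int blockSize = 256;
    const int numBlocks = (numElements + blockSize - 1) / blockSize;
    void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
    return cuLaunchKernel(kernel,
                          numBlocks, 1, 1,   // grid dimensions
                          blockSize, 1, 1,   // block dimensions
                          0,                 // dynamic shared memory bytes
                          0,                 // default stream
                          args, nullptr);    // kernel params, no extra options
}
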
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i 
< numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_quan_nvrtc_float.cpp b/test/test_inverse_chi_squared_quan_nvrtc_float.cpp new file mode 100644 index 000000000..ab494a8da --- /dev/null +++ b/test/test_inverse_chi_squared_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_cdf_double.cu b/test/test_inverse_gamma_cdf_double.cu new file mode 100644 index 000000000..4368a2284 --- /dev/null +++ b/test/test_inverse_gamma_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
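
For the inverse-gamma files that follow, one handy cross-check not used by the tests themselves: if X follows an inverse-gamma distribution with shape a and scale b, then 1/X follows a gamma distribution with shape a and scale 1/b, so the two CDFs must agree. A host-only sketch with arbitrary parameters:

#include <boost/math/distributions/inverse_gamma.hpp>
#include <boost/math/distributions/gamma.hpp>
#include <iostream>

int main()
{
    const double a = 2.0, b = 3.0, x = 1.5;
    boost::math::inverse_gamma_distribution<double> ig(a, b);
    boost::math::gamma_distribution<double>         g(a, 1.0 / b);
    // P(X <= x) = P(1/X >= 1/x) = 1 - F_gamma(1/x)
    std::cout << cdf(ig, x) << " == " << cdf(complement(g, 1.0 / x)) << std::endl;
}
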
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::inverse_gamma_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} diff --git a/test/test_inverse_gamma_cdf_float.cu b/test/test_inverse_gamma_cdf_float.cu new file mode 100644 index 000000000..cef2ec955 --- /dev/null +++ b/test/test_inverse_gamma_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::inverse_gamma_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} diff --git a/test/test_inverse_gamma_cdf_nvrtc_double.cpp b/test/test_inverse_gamma_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..c5a4b9878 --- /dev/null +++ b/test/test_inverse_gamma_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
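
The .cu harnesses time both the kernel and the serial reference with a `watch` from the repository's stopwatch.hpp. That header is not part of this patch; assuming it is a thin std::chrono wrapper (which matches how elapsed() and reset() are used above), a hypothetical equivalent:

#include <chrono>

class watch_sketch // illustrative; the repository's class is named `watch`
{
    std::chrono::steady_clock::time_point start_ = std::chrono::steady_clock::now();
public:
    // Seconds since construction or the last reset().
    double elapsed() const
    {
        return std::chrono::duration<double>(std::chrono::steady_clock::now() - start_).count();
    }
    void reset() { start_ = std::chrono::steady_clock::now(); }
};
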
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] 
= static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_cdf_nvrtc_float.cpp b/test/test_inverse_gamma_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..d76d51225 --- /dev/null +++ b/test/test_inverse_gamma_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_distribution.cpp b/test/test_inverse_gamma_distribution.cpp index 68b238fbc..436131d83 100644 --- a/test/test_inverse_gamma_distribution.cpp +++ b/test/test_inverse_gamma_distribution.cpp @@ -14,11 +14,14 @@ # pragma warning (disable : 4310) // cast truncates constant value #endif -#include +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT #include // for real_concept using ::boost::math::concepts::real_concept; +#endif -//#include #define BOOST_TEST_MAIN #include // for test_main #include // for BOOST_CHECK_CLOSE_FRACTION diff --git a/test/test_inverse_gamma_pdf_double.cu b/test/test_inverse_gamma_pdf_double.cu new file mode 100644 index 000000000..fa5073dbe --- /dev/null +++ b/test/test_inverse_gamma_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
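
The test_inverse_gamma_distribution.cpp hunk above fences the real_concept archetype behind BOOST_MATH_HAS_GPU_SUPPORT, so any later use of real_concept in that file has to sit behind the same guard. A sketch of the pattern (test_spots is a stand-in for the file's real test driver):

#ifndef BOOST_MATH_HAS_GPU_SUPPORT
#include <boost/math/concepts/real_concept.hpp>
using ::boost::math::concepts::real_concept;
#endif

template <class RealType>
void test_spots(RealType) { /* distribution spot checks go here */ }

void run_all_types()
{
    test_spots(0.0f);
    test_spots(0.0);
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
    test_spots(real_concept(0.0)); // archetype run only when not targeting GPU
#endif
}
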
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA kernel device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // Check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
diff --git a/test/test_inverse_gamma_pdf_float.cu b/test/test_inverse_gamma_pdf_float.cu
new file mode 100644
index 000000000..c2d80fe8d
--- /dev/null
+++ b/test/test_inverse_gamma_pdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA kernel device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // Check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
diff --git a/test/test_inverse_gamma_pdf_nvrtc_double.cpp b/test/test_inverse_gamma_pdf_nvrtc_double.cpp
new file mode 100644
index 000000000..db2c8c4e1
--- /dev/null
+++ b/test/test_inverse_gamma_pdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/distributions/inverse_gamma.hpp>
+extern "C" __global__
+void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize the CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create a CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load the PTX into a CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize the input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify the results
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::inverse_gamma_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_inverse_gamma_pdf_nvrtc_float.cpp b/test/test_inverse_gamma_pdf_nvrtc_float.cpp
new file mode 100644
index 000000000..4d552cf61
--- /dev/null
+++ b/test/test_inverse_gamma_pdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/distributions/inverse_gamma.hpp>
+extern "C" __global__
+void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize the CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create a CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load the PTX into a CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize the input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify the results
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::inverse_gamma_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_inverse_gamma_quan_double.cu b/test/test_inverse_gamma_quan_double.cu
new file mode 100644
index 000000000..c9095d752
--- /dev/null
+++ b/test/test_inverse_gamma_quan_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA kernel device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // Check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
diff --git a/test/test_inverse_gamma_quan_float.cu b/test/test_inverse_gamma_quan_float.cu
new file mode 100644
index 000000000..3e60feaa1
--- /dev/null
+++ b/test/test_inverse_gamma_quan_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA kernel device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // Check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
diff --git a/test/test_inverse_gamma_quan_nvrtc_double.cpp b/test/test_inverse_gamma_quan_nvrtc_double.cpp
new file mode 100644
index 000000000..a49600bde
--- /dev/null
+++ b/test/test_inverse_gamma_quan_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/distributions/inverse_gamma.hpp>
+extern "C" __global__
+void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize the CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create a CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load the PTX into a CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize the input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify the results
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::inverse_gamma_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_inverse_gamma_quan_nvrtc_float.cpp b/test/test_inverse_gamma_quan_nvrtc_float.cpp
new file mode 100644
index 000000000..f71ed964a
--- /dev/null
+++ b/test/test_inverse_gamma_quan_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/distributions/inverse_gamma.hpp>
+extern "C" __global__
+void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize the CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create a CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load the PTX into a CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize the input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify the results
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::inverse_gamma_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_landau_cdf_double.cu b/test/test_landau_cdf_double.cu
index 092fff00e..40bff707d 100644
--- a/test/test_landau_cdf_double.cu
+++ b/test/test_landau_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_landau_cdf_float.cu b/test/test_landau_cdf_float.cu
index 143755aff..c4513c084 100644
--- a/test/test_landau_cdf_float.cu
+++ b/test/test_landau_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_landau_pdf_double.cu b/test/test_landau_pdf_double.cu
index eea6f87ad..6ce3f5f78 100644
--- a/test/test_landau_pdf_double.cu
+++ b/test/test_landau_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
    // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_landau_pdf_float.cu b/test/test_landau_pdf_float.cu
index a424bdd67..5818ddf8a 100644
--- a/test/test_landau_pdf_float.cu
+++ b/test/test_landau_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_landau_quan_double.cu b/test/test_landau_quan_double.cu
index 8cdf12588..4995bd49c 100644
--- a/test/test_landau_quan_double.cu
+++ b/test/test_landau_quan_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_landau_quan_float.cu b/test/test_landau_quan_float.cu
index 8cdf12588..4995bd49c 100644
--- a/test/test_landau_quan_float.cu
+++ b/test/test_landau_quan_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_cdf_double.cu b/test/test_laplace_cdf_double.cu
index cddcfa2bc..ec3c83ecd 100644
--- a/test/test_laplace_cdf_double.cu
+++ b/test/test_laplace_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_cdf_float.cu b/test/test_laplace_cdf_float.cu
index 2af43f9f5..96acea2fd 100644
--- a/test/test_laplace_cdf_float.cu
+++ b/test/test_laplace_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_pdf_double.cu b/test/test_laplace_pdf_double.cu
index 2f53c0dd1..568be622b 100644
--- a/test/test_laplace_pdf_double.cu
+++ b/test/test_laplace_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_pdf_float.cu b/test/test_laplace_pdf_float.cu
index a8d673dba..cb2aa67c1 100644
--- a/test/test_laplace_pdf_float.cu
+++ b/test/test_laplace_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_quan_double.cu b/test/test_laplace_quan_double.cu
index cddcfa2bc..ec3c83ecd 100644
--- a/test/test_laplace_quan_double.cu
+++ b/test/test_laplace_quan_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_quan_float.cu b/test/test_laplace_quan_float.cu
index 2af43f9f5..96acea2fd 100644
--- a/test/test_laplace_quan_float.cu
+++ b/test/test_laplace_quan_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_cdf_double.cu b/test/test_logistic_cdf_double.cu
index 5dd3723c5..6b4e85025 100644
--- a/test/test_logistic_cdf_double.cu
+++ b/test/test_logistic_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_cdf_float.cu b/test/test_logistic_cdf_float.cu
index 89d05747b..75b6ab0af 100644
--- a/test/test_logistic_cdf_float.cu
+++ b/test/test_logistic_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_pdf_double.cu b/test/test_logistic_pdf_double.cu
index 39aaa1597..90232a2d6 100644
--- a/test/test_logistic_pdf_double.cu
+++ b/test/test_logistic_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_pdf_float.cu b/test/test_logistic_pdf_float.cu
index 279112b99..0a99ff9cf 100644
--- a/test/test_logistic_pdf_float.cu
+++ b/test/test_logistic_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_quan_double.cu b/test/test_logistic_quan_double.cu
index ad929d442..afe8a4c8c 100644
--- a/test/test_logistic_quan_double.cu
+++ b/test/test_logistic_quan_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_quan_float.cu b/test/test_logistic_quan_float.cu
index 81c22ea4b..92c371062 100644
--- a/test/test_logistic_quan_float.cu
+++ b/test/test_logistic_quan_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_cdf_double.cu b/test/test_mapairy_cdf_double.cu
index 1494181bf..7cb62a934 100644
--- a/test/test_mapairy_cdf_double.cu
+++ b/test/test_mapairy_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_cdf_float.cu b/test/test_mapairy_cdf_float.cu
index 41dd4615a..b67c0ee93 100644
--- a/test/test_mapairy_cdf_float.cu
+++ b/test/test_mapairy_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_pdf_double.cu b/test/test_mapairy_pdf_double.cu
index ad3abfbee..4ccd8b2f2 100644
--- a/test/test_mapairy_pdf_double.cu
+++ b/test/test_mapairy_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_pdf_float.cu b/test/test_mapairy_pdf_float.cu
index cabee4a2f..520ac9a68 100644
--- a/test/test_mapairy_pdf_float.cu
+++ b/test/test_mapairy_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_quan_double.cu b/test/test_mapairy_quan_double.cu
index fe6265eff..378700020 100644
--- a/test/test_mapairy_quan_double.cu
+++ b/test/test_mapairy_quan_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_quan_float.cu b/test/test_mapairy_quan_float.cu
index ad2f6b5eb..cd9d12007 100644
--- a/test/test_mapairy_quan_float.cu
+++ b/test/test_mapairy_quan_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_cdf_double.cu b/test/test_saspoint5_cdf_double.cu
index 745ca2bf8..fb3e2f74c 100644
--- a/test/test_saspoint5_cdf_double.cu
+++ b/test/test_saspoint5_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_cdf_float.cu b/test/test_saspoint5_cdf_float.cu
index 51bc2e870..325a470bb 100644
--- a/test/test_saspoint5_cdf_float.cu
+++ b/test/test_saspoint5_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_pdf_double.cu b/test/test_saspoint5_pdf_double.cu
index 948a09260..5392a328b 100644
--- a/test/test_saspoint5_pdf_double.cu
+++ b/test/test_saspoint5_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_pdf_float.cu b/test/test_saspoint5_pdf_float.cu
index 4980e9070..01fbcd472 100644
--- a/test/test_saspoint5_pdf_float.cu
+++ b/test/test_saspoint5_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_quan_double.cu b/test/test_saspoint5_quan_double.cu
index 764c27899..7415f0690 100644
--- a/test/test_saspoint5_quan_double.cu
+++ b/test/test_saspoint5_quan_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_quan_float.cu b/test/test_saspoint5_quan_float.cu
index a65958fb8..d6f49084b 100644
--- a/test/test_saspoint5_quan_float.cu
+++ b/test/test_saspoint5_quan_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_cdf_double.cu b/test/test_weibull_cdf_double.cu
index 65efbe252..1b2e5cf0d 100644
--- a/test/test_weibull_cdf_double.cu
+++ b/test/test_weibull_cdf_double.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_cdf_float.cu b/test/test_weibull_cdf_float.cu
index 65c3ce1ff..76bf3a4e1 100644
--- a/test/test_weibull_cdf_float.cu
+++ b/test/test_weibull_cdf_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_pdf_double.cu b/test/test_weibull_pdf_double.cu
index 645df4c0a..dd48b57d6 100644
--- a/test/test_weibull_pdf_double.cu
+++ b/test/test_weibull_pdf_double.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_pdf_float.cu b/test/test_weibull_pdf_float.cu
index f1e6917f0..40064b1ed 100644
--- a/test/test_weibull_pdf_float.cu
+++ b/test/test_weibull_pdf_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_quan_double.cu b/test/test_weibull_quan_double.cu
index 2f0500602..9263fb536 100644
--- a/test/test_weibull_quan_double.cu
+++ b/test/test_weibull_quan_double.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_quan_float.cu b/test/test_weibull_quan_float.cu
index 3027e14dd..5dd6bd6ee 100644
--- a/test/test_weibull_quan_float.cu
+++ b/test/test_weibull_quan_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;