
Apply GPU markers to ibeta_inv_ab

Remove NVRTC workaround

Apply GPU markers to ibeta_inverse

Apply GPU markers to t_dist_inv

Fix warning suppression

Add dispatch function and remove workaround

Move disabling block

Make binomial GPU enabled

Add SYCL testing of ibeta

Add SYCL testing of ibeta_inv

Add SYCL testing of ibeta_inv_ab

Add SYCL testing of full beta suite

Add markers to fwd decls

Add special forward decls for NVRTC

Add betac nvrtc testing

Add betac CUDA testing

Add ibeta CUDA testing

Add ibeta NVRTC testing

Add ibetac NVRTC testing

Add ibeta_derivative testing to nvrtc

Add ibeta_derivative CUDA testing

Add cbrt policy overload for NVRTC

Fix NVRTC definition of BOOST_MATH_IF_CONSTEXPR

Add ibeta_inv and ibetac_inv NVRTC testing

Fix make pair helper on device

Add CUDA testing of ibeta_inv* and ibetac_inv*

Move location so that it also works on NVRTC

Add NVRTC testing of ibeta_inv* and ibetac_inv*

Fixup test sets since they ignore the policy

Make the beta dist GPU compatible

Add beta dist SYCL testing

Add beta dist CUDA testing

Add beta dist NVRTC testing
This commit is contained in:
Matt Borland
2024-08-30 13:45:53 -04:00
parent 0002de0b85
commit adf8abd346
85 changed files with 9095 additions and 240 deletions
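
Most of the diff below consists of prefixing functions with BOOST_MATH_GPU_ENABLED so the same code compiles for host and device. As a rough sketch of the usual shape of such a marker (an assumption for illustration; the real definition lives in boost/math/tools/config.hpp and also covers SYCL and HIP builds):

// Sketch only -- not the exact Boost.Math definition.
#if defined(__CUDACC__) || defined(__CUDACC_RTC__)
#  define BOOST_MATH_GPU_ENABLED __host__ __device__
#else
#  define BOOST_MATH_GPU_ENABLED
#endif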

View File

@@ -25,12 +25,15 @@
#ifndef BOOST_MATH_DIST_BETA_HPP
#define BOOST_MATH_DIST_BETA_HPP
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/distributions/fwd.hpp>
#include <boost/math/special_functions/beta.hpp> // for beta.
#include <boost/math/distributions/complement.hpp> // complements.
#include <boost/math/distributions/detail/common_error_handling.hpp> // error checks
#include <boost/math/special_functions/fpclassify.hpp> // isnan.
#include <boost/math/tools/roots.hpp> // for root finding.
#include <boost/math/policies/error_handling.hpp>
#if defined (BOOST_MSVC)
# pragma warning(push)
@@ -38,8 +41,6 @@
// in domain_error_imp in error_handling
#endif
#include <utility>
namespace boost
{
namespace math
@@ -48,7 +49,7 @@ namespace boost
{
// Common error checking routines for beta distribution functions:
template <class RealType, class Policy>
inline bool check_alpha(const char* function, const RealType& alpha, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_alpha(const char* function, const RealType& alpha, RealType* result, const Policy& pol)
{
if(!(boost::math::isfinite)(alpha) || (alpha <= 0))
{
@@ -61,7 +62,7 @@ namespace boost
} // bool check_alpha
template <class RealType, class Policy>
inline bool check_beta(const char* function, const RealType& beta, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_beta(const char* function, const RealType& beta, RealType* result, const Policy& pol)
{
if(!(boost::math::isfinite)(beta) || (beta <= 0))
{
@@ -74,7 +75,7 @@ namespace boost
} // bool check_beta
template <class RealType, class Policy>
inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol)
{
if((p < 0) || (p > 1) || !(boost::math::isfinite)(p))
{
@@ -87,7 +88,7 @@ namespace boost
} // bool check_prob
template <class RealType, class Policy>
inline bool check_x(const char* function, const RealType& x, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_x(const char* function, const RealType& x, RealType* result, const Policy& pol)
{
if(!(boost::math::isfinite)(x) || (x < 0) || (x > 1))
{
@@ -100,28 +101,28 @@ namespace boost
} // bool check_x
template <class RealType, class Policy>
inline bool check_dist(const char* function, const RealType& alpha, const RealType& beta, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_dist(const char* function, const RealType& alpha, const RealType& beta, RealType* result, const Policy& pol)
{ // Check both alpha and beta.
return check_alpha(function, alpha, result, pol)
&& check_beta(function, beta, result, pol);
} // bool check_dist
template <class RealType, class Policy>
inline bool check_dist_and_x(const char* function, const RealType& alpha, const RealType& beta, RealType x, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_dist_and_x(const char* function, const RealType& alpha, const RealType& beta, RealType x, RealType* result, const Policy& pol)
{
return check_dist(function, alpha, beta, result, pol)
&& beta_detail::check_x(function, x, result, pol);
} // bool check_dist_and_x
template <class RealType, class Policy>
inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol)
{
return check_dist(function, alpha, beta, result, pol)
&& check_prob(function, p, result, pol);
} // bool check_dist_and_prob
template <class RealType, class Policy>
inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol)
{
if(!(boost::math::isfinite)(mean) || (mean <= 0))
{
@@ -133,7 +134,7 @@ namespace boost
return true;
} // bool check_mean
template <class RealType, class Policy>
inline bool check_variance(const char* function, const RealType& variance, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_variance(const char* function, const RealType& variance, RealType* result, const Policy& pol)
{
if(!(boost::math::isfinite)(variance) || (variance <= 0))
{
@@ -157,7 +158,7 @@ namespace boost
typedef RealType value_type;
typedef Policy policy_type;
beta_distribution(RealType l_alpha = 1, RealType l_beta = 1) : m_alpha(l_alpha), m_beta(l_beta)
BOOST_MATH_GPU_ENABLED beta_distribution(RealType l_alpha = 1, RealType l_beta = 1) : m_alpha(l_alpha), m_beta(l_beta)
{
RealType result;
beta_detail::check_dist(
@@ -167,11 +168,11 @@ namespace boost
&result, Policy());
} // beta_distribution constructor.
// Accessor functions:
RealType alpha() const
BOOST_MATH_GPU_ENABLED RealType alpha() const
{
return m_alpha;
}
RealType beta() const
BOOST_MATH_GPU_ENABLED RealType beta() const
{ // .
return m_beta;
}
@@ -183,11 +184,11 @@ namespace boost
// http://www.itl.nist.gov/div898/handbook/eda/section3/eda366h.htm
// http://www.epi.ucdavis.edu/diagnostictests/betabuster.html
static RealType find_alpha(
BOOST_MATH_GPU_ENABLED static RealType find_alpha(
RealType mean, // Expected value of mean.
RealType variance) // Expected value of variance.
{
static const char* function = "boost::math::beta_distribution<%1%>::find_alpha";
constexpr auto function = "boost::math::beta_distribution<%1%>::find_alpha";
RealType result = 0; // of error checks.
if(false ==
(
@@ -201,11 +202,11 @@ namespace boost
return mean * (( (mean * (1 - mean)) / variance)- 1);
} // RealType find_alpha
static RealType find_beta(
BOOST_MATH_GPU_ENABLED static RealType find_beta(
RealType mean, // Expected value of mean.
RealType variance) // Expected value of variance.
{
static const char* function = "boost::math::beta_distribution<%1%>::find_beta";
constexpr auto function = "boost::math::beta_distribution<%1%>::find_beta";
RealType result = 0; // of error checks.
if(false ==
(
@@ -223,12 +224,12 @@ namespace boost
// Estimate alpha & beta from either alpha or beta, and x and probability.
// Uses for these parameter estimators are unclear.
static RealType find_alpha(
BOOST_MATH_GPU_ENABLED static RealType find_alpha(
RealType beta, // from beta.
RealType x, // x.
RealType probability) // cdf
{
static const char* function = "boost::math::beta_distribution<%1%>::find_alpha";
constexpr auto function = "boost::math::beta_distribution<%1%>::find_alpha";
RealType result = 0; // of error checks.
if(false ==
(
@@ -245,13 +246,13 @@ namespace boost
return static_cast<RealType>(ibeta_inva(beta, x, probability, Policy()));
} // RealType find_alpha(beta, a, probability)
static RealType find_beta(
BOOST_MATH_GPU_ENABLED static RealType find_beta(
// ibeta_invb(T b, T x, T p); (alpha, x, cdf,)
RealType alpha, // alpha.
RealType x, // probability x.
RealType probability) // probability cdf.
{
static const char* function = "boost::math::beta_distribution<%1%>::find_beta";
constexpr auto function = "boost::math::beta_distribution<%1%>::find_beta";
RealType result = 0; // of error checks.
if(false ==
(
@@ -281,27 +282,27 @@ namespace boost
#endif
template <class RealType, class Policy>
inline const std::pair<RealType, RealType> range(const beta_distribution<RealType, Policy>& /* dist */)
BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> range(const beta_distribution<RealType, Policy>& /* dist */)
{ // Range of permissible values for random variable x.
using boost::math::tools::max_value;
return std::pair<RealType, RealType>(static_cast<RealType>(0), static_cast<RealType>(1));
return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), static_cast<RealType>(1));
}
template <class RealType, class Policy>
inline const std::pair<RealType, RealType> support(const beta_distribution<RealType, Policy>& /* dist */)
BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> support(const beta_distribution<RealType, Policy>& /* dist */)
{ // Range of supported values for random variable x.
// This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
return std::pair<RealType, RealType>(static_cast<RealType>(0), static_cast<RealType>(1));
return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), static_cast<RealType>(1));
}
template <class RealType, class Policy>
inline RealType mean(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType mean(const beta_distribution<RealType, Policy>& dist)
{ // Mean of beta distribution = np.
return dist.alpha() / (dist.alpha() + dist.beta());
} // mean
template <class RealType, class Policy>
inline RealType variance(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType variance(const beta_distribution<RealType, Policy>& dist)
{ // Variance of beta distribution = np(1-p).
RealType a = dist.alpha();
RealType b = dist.beta();
@@ -309,9 +310,9 @@ namespace boost
} // variance
template <class RealType, class Policy>
inline RealType mode(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType mode(const beta_distribution<RealType, Policy>& dist)
{
static const char* function = "boost::math::mode(beta_distribution<%1%> const&)";
constexpr auto function = "boost::math::mode(beta_distribution<%1%> const&)";
RealType result;
if ((dist.alpha() <= 1))
@@ -343,7 +344,7 @@ namespace boost
//But WILL be provided by the derived accessor as quantile(0.5).
template <class RealType, class Policy>
inline RealType skewness(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType skewness(const beta_distribution<RealType, Policy>& dist)
{
BOOST_MATH_STD_USING // ADL of std functions.
RealType a = dist.alpha();
@@ -352,7 +353,7 @@ namespace boost
} // skewness
template <class RealType, class Policy>
inline RealType kurtosis_excess(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const beta_distribution<RealType, Policy>& dist)
{
RealType a = dist.alpha();
RealType b = dist.beta();
@@ -363,17 +364,17 @@ namespace boost
} // kurtosis_excess
template <class RealType, class Policy>
inline RealType kurtosis(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const beta_distribution<RealType, Policy>& dist)
{
return 3 + kurtosis_excess(dist);
} // kurtosis
template <class RealType, class Policy>
inline RealType pdf(const beta_distribution<RealType, Policy>& dist, const RealType& x)
BOOST_MATH_GPU_ENABLED inline RealType pdf(const beta_distribution<RealType, Policy>& dist, const RealType& x)
{ // Probability Density/Mass Function.
BOOST_FPU_EXCEPTION_GUARD
static const char* function = "boost::math::pdf(beta_distribution<%1%> const&, %1%)";
constexpr auto function = "boost::math::pdf(beta_distribution<%1%> const&, %1%)";
BOOST_MATH_STD_USING // for ADL of std functions
@@ -428,11 +429,11 @@ namespace boost
} // pdf
template <class RealType, class Policy>
inline RealType cdf(const beta_distribution<RealType, Policy>& dist, const RealType& x)
BOOST_MATH_GPU_ENABLED inline RealType cdf(const beta_distribution<RealType, Policy>& dist, const RealType& x)
{ // Cumulative Distribution Function beta.
BOOST_MATH_STD_USING // for ADL of std functions
static const char* function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)";
constexpr auto function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)";
RealType a = dist.alpha();
RealType b = dist.beta();
@@ -459,12 +460,12 @@ namespace boost
} // beta cdf
template <class RealType, class Policy>
inline RealType cdf(const complemented2_type<beta_distribution<RealType, Policy>, RealType>& c)
BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<beta_distribution<RealType, Policy>, RealType>& c)
{ // Complemented Cumulative Distribution Function beta.
BOOST_MATH_STD_USING // for ADL of std functions
static const char* function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)";
constexpr auto function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)";
RealType const& x = c.param;
beta_distribution<RealType, Policy> const& dist = c.dist;
@@ -495,7 +496,7 @@ namespace boost
} // beta cdf
template <class RealType, class Policy>
inline RealType quantile(const beta_distribution<RealType, Policy>& dist, const RealType& p)
BOOST_MATH_GPU_ENABLED inline RealType quantile(const beta_distribution<RealType, Policy>& dist, const RealType& p)
{ // Quantile or Percent Point beta function or
// Inverse Cumulative probability distribution function CDF.
// Return x (0 <= x <= 1),
@@ -505,7 +506,7 @@ namespace boost
// will be less than or equal to that value
// is whatever probability you supplied as an argument.
static const char* function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)";
constexpr auto function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)";
RealType result = 0; // of argument checks:
RealType a = dist.alpha();
@@ -530,12 +531,12 @@ namespace boost
} // quantile
template <class RealType, class Policy>
inline RealType quantile(const complemented2_type<beta_distribution<RealType, Policy>, RealType>& c)
BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<beta_distribution<RealType, Policy>, RealType>& c)
{ // Complement Quantile or Percent Point beta function .
// Return the number of expected x for a given
// complement of the probability q.
static const char* function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)";
constexpr auto function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)";
//
// Error checks:
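
With the markers applied, the distribution's constructor, accessors, and the non-member pdf/cdf/quantile overloads become callable from device code, which is what the CUDA test kernel at the end of this commit exercises. A minimal sketch of a device-side call (assuming CUDA compilation):

__global__ void beta_cdf_kernel(const double* in, double* out, int n)
{
   int i = blockDim.x * blockIdx.x + threadIdx.x;
   if (i < n)
   {
      // Constructor, cdf, and the ADL lookup are all GPU-enabled now.
      out[i] = cdf(boost::math::beta_distribution<double>(2.0, 3.0), in[i]);
   }
}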

View File

@@ -12,6 +12,7 @@
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/tuple.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
@@ -877,20 +878,6 @@ BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, s
} //namespace policies
namespace detail{
//
// Simple helper function to assist in returning a pair from a single value,
// that value usually comes from one of the error handlers above:
//
template <class T>
BOOST_MATH_GPU_ENABLED std::pair<T, T> pair_from_single(const T& val) BOOST_MATH_NOEXCEPT(T)
{
return std::make_pair(val, val);
}
}
#ifdef _MSC_VER
# pragma warning(pop)
#endif
@@ -1039,7 +1026,21 @@ BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, b
} // namespace math
} // namespace boost
#endif
#endif // BOOST_MATH_HAS_NVRTC
namespace boost { namespace math { namespace detail {
//
// Simple helper function to assist in returning a pair from a single value,
// that value usually comes from one of the error handlers above:
//
template <class T>
BOOST_MATH_GPU_ENABLED boost::math::pair<T, T> pair_from_single(const T& val) BOOST_MATH_NOEXCEPT(T)
{
return boost::math::make_pair(val, val);
}
}}} // boost::math::detail
#endif // BOOST_MATH_POLICY_ERROR_HANDLING_HPP
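
pair_from_single is relocated out of the #ifndef BOOST_MATH_HAS_NVRTC region and retargeted at boost::math::pair so that device code can use it as well. A sketch of a typical call site, with a hypothetical error value for illustration:

// Sketch: collapse a single error-handler result into the
// (lower, upper) bracket pair the root-bracketing callers return.
T err = policies::raise_domain_error<T>(function, "Argument out of range: %1%", val, pol);
return boost::math::detail::pair_from_single(err); // yields {err, err}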

View File

@@ -28,14 +28,10 @@
#include <boost/math/policies/policy.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/constants/constants.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/binomial.hpp>
#include <boost/math/special_functions/factorials.hpp>
#include <boost/math/tools/roots.hpp>
#include <cmath>
#endif
namespace boost{ namespace math{
@@ -800,7 +796,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const boost::math::la
policies::check_series_iterations<T>("boost::math::ibeta<%1%>(%1%, %1%, %1%) in ibeta_series (without lanczos)", max_iter, pol);
return result;
}
#endif
//
// Continued fraction for the incomplete beta:
//
@@ -884,7 +880,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_a_step(T a, T b, T x, T y, int k, const Policy& p
return prefix;
}
#endif
//
// This function is only needed for the non-regular incomplete beta,
// it computes the delta in:
@@ -958,7 +954,6 @@ struct Pn_size<long double>
#endif
};
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
template <class T, class Policy>
BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Policy& pol, bool normalised)
{
@@ -1060,7 +1055,7 @@ BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T
}
return sum;
} // template <class T, class Lanczos>T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Lanczos& l, bool normalised)
#endif
//
// For integer arguments we can relate the incomplete beta to the
// complement of the binomial distribution cdf and use this finite sum.
@@ -1130,6 +1125,7 @@ BOOST_MATH_GPU_ENABLED T binomial_ccdf(T n, T k, T x, T y, const Policy& pol)
// input range and select the right implementation method for
// each domain:
//
template <class T, class Policy>
BOOST_MATH_GPU_ENABLED T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_derivative)
{
@@ -1749,12 +1745,7 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
} // namespace math
} // namespace boost
// TODO(mborland): Get the ibeta_inv working on NVRTC
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/special_functions/detail/ibeta_inverse.hpp>
#include <boost/math/special_functions/detail/ibeta_inv_ab.hpp>
#endif
#endif // BOOST_MATH_SPECIAL_BETA_HPP

View File

@@ -10,20 +10,21 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/factorials.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <type_traits>
namespace boost{ namespace math{
template <class T, class Policy>
T binomial_coefficient(unsigned n, unsigned k, const Policy& pol)
BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol)
{
static_assert(!std::is_integral<T>::value, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value, "Type T must not be an integral type");
BOOST_MATH_STD_USING
static const char* function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)";
constexpr auto function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)";
if(k > n)
return policies::raise_domain_error<T>(function, "The binomial coefficient is undefined for k > n, but got k = %1%.", static_cast<T>(k), pol);
T result; // LCOV_EXCL_LINE
@@ -43,9 +44,9 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol)
{
// Use the beta function:
if(k < n - k)
result = static_cast<T>(k * beta(static_cast<T>(k), static_cast<T>(n-k+1), pol));
result = static_cast<T>(k * boost::math::beta(static_cast<T>(k), static_cast<T>(n-k+1), pol));
else
result = static_cast<T>((n - k) * beta(static_cast<T>(k+1), static_cast<T>(n-k), pol));
result = static_cast<T>((n - k) * boost::math::beta(static_cast<T>(k+1), static_cast<T>(n-k), pol));
if(result == 0)
return policies::raise_overflow_error<T>(function, nullptr, pol);
result = 1 / result;
@@ -59,7 +60,7 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol)
// we'll promote to double:
//
template <>
inline float binomial_coefficient<float, policies::policy<> >(unsigned n, unsigned k, const policies::policy<>&)
BOOST_MATH_GPU_ENABLED inline float binomial_coefficient<float, policies::policy<> >(unsigned n, unsigned k, const policies::policy<>&)
{
typedef policies::normalise<
policies::policy<>,
@@ -71,7 +72,7 @@ inline float binomial_coefficient<float, policies::policy<> >(unsigned n, unsign
}
template <class T>
inline T binomial_coefficient(unsigned n, unsigned k)
BOOST_MATH_GPU_ENABLED inline T binomial_coefficient(unsigned n, unsigned k)
{
return binomial_coefficient<T>(n, k, policies::policy<>());
}
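
The call pattern itself is unchanged by the markers; for example, on the host:

#include <boost/math/special_functions/binomial.hpp>
#include <iostream>

int main()
{
   // C(10, 4) = 10! / (4! * 6!) = 210, via the factorial table.
   std::cout << boost::math::binomial_coefficient<double>(10, 4) << '\n';
}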

View File

@@ -11,15 +11,16 @@
#pragma once
#endif
#ifndef __CUDACC_RTC__
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <type_traits>
#include <cstdint>
namespace boost{ namespace math{
@@ -174,19 +175,30 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type cbrt(T z)
} // namespace math
} // namespace boost
#else
#else // Special NVRTC handling
namespace boost {
namespace math {
template <typename T>
__host__ __device__ T cbrt(T x)
BOOST_MATH_GPU_ENABLED double cbrt(T x)
{
return ::cbrt(x);
}
template <>
__host__ __device__ float cbrt(float x)
BOOST_MATH_GPU_ENABLED inline float cbrt(float x)
{
return ::cbrtf(x);
}
template <typename T, typename Policy>
BOOST_MATH_GPU_ENABLED double cbrt(T x, const Policy&)
{
return ::cbrt(x);
}
template <typename Policy>
BOOST_MATH_GPU_ENABLED float cbrt(float x, const Policy&)
{
return ::cbrtf(x);
}
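
The added policy-taking overloads let internal call sites of the form cbrt(x, pol) compile under NVRTC: the policy is accepted and ignored, and the call forwards to libdevice. A sketch of device-side usage (assuming NVRTC compilation and a policy type visible at the call site):

// Both forms now resolve under NVRTC.
double r1 = boost::math::cbrt(27.0);                                      // -> ::cbrt
float  r2 = boost::math::cbrt(27.0f, boost::math::policies::policy<>()); // -> ::cbrtf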

View File

@@ -17,17 +17,19 @@
#pragma once
#endif
#include <cstdint>
#include <utility>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/toms748_solve.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/policies/error_handling.hpp>
namespace boost{ namespace math{ namespace detail{
template <class T, class Policy>
struct beta_inv_ab_t
{
beta_inv_ab_t(T b_, T z_, T p_, bool invert_, bool swap_ab_) : b(b_), z(z_), p(p_), invert(invert_), swap_ab(swap_ab_) {}
T operator()(T a)
BOOST_MATH_GPU_ENABLED beta_inv_ab_t(T b_, T z_, T p_, bool invert_, bool swap_ab_) : b(b_), z(z_), p(p_), invert(invert_), swap_ab(swap_ab_) {}
BOOST_MATH_GPU_ENABLED T operator()(T a)
{
return invert ?
p - boost::math::ibetac(swap_ab ? b : a, swap_ab ? a : b, z, Policy())
@@ -39,7 +41,7 @@ private:
};
template <class T, class Policy>
T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Policy& pol)
BOOST_MATH_GPU_ENABLED T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Policy& pol)
{
BOOST_MATH_STD_USING
// mean:
@@ -72,7 +74,7 @@ T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Pol
}
template <class T, class Policy>
T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, const Policy& pol)
BOOST_MATH_GPU_ENABLED T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, const Policy& pol)
{
BOOST_MATH_STD_USING // for ADL of std lib math functions
//
@@ -121,11 +123,11 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab,
//
if((p < q) != swap_ab)
{
guess = (std::min)(T(b * 2), T(1));
guess = BOOST_MATH_GPU_SAFE_MIN(T(b * 2), T(1));
}
else
{
guess = (std::min)(T(b / 2), T(1));
guess = BOOST_MATH_GPU_SAFE_MIN(T(b / 2), T(1));
}
}
if(n * n * n * u * sf > 0.005)
@@ -138,11 +140,11 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab,
//
if((p < q) != swap_ab)
{
guess = (std::min)(T(b * 2), T(10));
guess = BOOST_MATH_GPU_SAFE_MIN(T(b * 2), T(10));
}
else
{
guess = (std::min)(T(b / 2), T(10));
guess = BOOST_MATH_GPU_SAFE_MIN(T(b / 2), T(10));
}
}
else
@@ -151,8 +153,8 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab,
//
// Max iterations permitted:
//
std::uintmax_t max_iter = policies::get_max_root_iterations<Policy>();
std::pair<T, T> r = bracket_and_solve_root(f, guess, factor, swap_ab ? true : false, tol, max_iter, pol);
boost::math::uintmax_t max_iter = policies::get_max_root_iterations<Policy>();
boost::math::pair<T, T> r = bracket_and_solve_root(f, guess, factor, swap_ab ? true : false, tol, max_iter, pol);
if(max_iter >= policies::get_max_root_iterations<Policy>())
return policies::raise_evaluation_error<T>("boost::math::ibeta_invab_imp<%1%>(%1%,%1%,%1%)", "Unable to locate the root within a reasonable number of iterations, closest approximation so far was %1%", r.first, pol);
return (r.first + r.second) / 2;
@@ -161,7 +163,7 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab,
} // namespace detail
template <class RT1, class RT2, class RT3, class Policy>
typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED typename tools::promote_args<RT1, RT2, RT3>::type
ibeta_inva(RT1 b, RT2 x, RT3 p, const Policy& pol)
{
typedef typename tools::promote_args<RT1, RT2, RT3>::type result_type;
@@ -194,7 +196,7 @@ typename tools::promote_args<RT1, RT2, RT3>::type
}
template <class RT1, class RT2, class RT3, class Policy>
typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_inva(RT1 b, RT2 x, RT3 q, const Policy& pol)
{
typedef typename tools::promote_args<RT1, RT2, RT3>::type result_type;
@@ -227,7 +229,7 @@ typename tools::promote_args<RT1, RT2, RT3>::type
}
template <class RT1, class RT2, class RT3, class Policy>
typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED typename tools::promote_args<RT1, RT2, RT3>::type
ibeta_invb(RT1 a, RT2 x, RT3 p, const Policy& pol)
{
typedef typename tools::promote_args<RT1, RT2, RT3>::type result_type;
@@ -260,7 +262,7 @@ typename tools::promote_args<RT1, RT2, RT3>::type
}
template <class RT1, class RT2, class RT3, class Policy>
typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_invb(RT1 a, RT2 x, RT3 q, const Policy& pol)
{
constexpr auto function = "boost::math::ibeta_invb<%1%>(%1%, %1%, %1%)";
@@ -293,28 +295,28 @@ typename tools::promote_args<RT1, RT2, RT3>::type
}
template <class RT1, class RT2, class RT3>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibeta_inva(RT1 b, RT2 x, RT3 p)
{
return boost::math::ibeta_inva(b, x, p, policies::policy<>());
}
template <class RT1, class RT2, class RT3>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_inva(RT1 b, RT2 x, RT3 q)
{
return boost::math::ibetac_inva(b, x, q, policies::policy<>());
}
template <class RT1, class RT2, class RT3>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibeta_invb(RT1 a, RT2 x, RT3 p)
{
return boost::math::ibeta_invb(a, x, p, policies::policy<>());
}
template <class RT1, class RT2, class RT3>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_invb(RT1 a, RT2 x, RT3 q)
{
return boost::math::ibetac_invb(a, x, q, policies::policy<>());
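
The BOOST_MATH_GPU_SAFE_MIN macro and the boost::math::uintmax_t/pair aliases used above replace (std::min), std::uintmax_t, and std::pair, which are not usable from NVRTC device code. A sketch of the min shim's likely shape (an assumption; compare the gpu_safe_max helper visible in the config.hpp hunk further down):

// Sketch only: a device-callable min.
template <class T>
BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_min(const T& a, const T& b)
{
   return a < b ? a : b;
}
#define BOOST_MATH_GPU_SAFE_MIN(a, b) gpu_safe_min(a, b)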

View File

@@ -11,12 +11,14 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/roots.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/erf.hpp>
#include <boost/math/tools/roots.hpp>
#include <boost/math/special_functions/detail/t_distribution_inv.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/tools/precision.hpp>
namespace boost{ namespace math{ namespace detail{
@@ -27,12 +29,12 @@ namespace boost{ namespace math{ namespace detail{
template <class T>
struct temme_root_finder
{
temme_root_finder(const T t_, const T a_) : t(t_), a(a_) {
BOOST_MATH_GPU_ENABLED temme_root_finder(const T t_, const T a_) : t(t_), a(a_) {
BOOST_MATH_ASSERT(
math::tools::epsilon<T>() <= a && !(boost::math::isinf)(a));
}
boost::math::tuple<T, T> operator()(T x)
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(T x)
{
BOOST_MATH_STD_USING // ADL of std names
@@ -52,7 +54,7 @@ private:
// Section 2.
//
template <class T, class Policy>
T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol)
BOOST_MATH_GPU_ENABLED T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
@@ -138,7 +140,7 @@ T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol)
// Section 3.
//
template <class T, class Policy>
T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy& pol)
BOOST_MATH_GPU_ENABLED T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
@@ -315,7 +317,7 @@ T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy
// Section 4.
//
template <class T, class Policy>
T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol)
BOOST_MATH_GPU_ENABLED T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
@@ -420,10 +422,10 @@ T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol)
template <class T, class Policy>
struct ibeta_roots
{
ibeta_roots(T _a, T _b, T t, bool inv = false)
BOOST_MATH_GPU_ENABLED ibeta_roots(T _a, T _b, T t, bool inv = false)
: a(_a), b(_b), target(t), invert(inv) {}
boost::math::tuple<T, T, T> operator()(T x)
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T, T> operator()(T x)
{
BOOST_MATH_STD_USING // ADL of std names
@@ -457,7 +459,7 @@ private:
};
template <class T, class Policy>
T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
BOOST_MATH_GPU_ENABLED T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
{
BOOST_MATH_STD_USING // For ADL of math functions.
@@ -487,8 +489,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
return p;
}
// Change things around so we can handle as b == 1 special case below:
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = true;
}
//
@@ -524,8 +526,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
}
else if(b > 0.5f)
{
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = !invert;
}
}
@@ -559,7 +561,7 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
y = -boost::math::expm1(boost::math::log1p(-q, pol) / a, pol);
}
if(invert)
std::swap(x, y);
BOOST_MATH_GPU_SAFE_SWAP(x, y);
if(py)
*py = y;
return x;
@@ -574,12 +576,12 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
//
if(p > 0.5)
{
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = !invert;
}
T minv = (std::min)(a, b);
T maxv = (std::max)(a, b);
T minv = BOOST_MATH_GPU_SAFE_MIN(a, b);
T maxv = BOOST_MATH_GPU_SAFE_MAX(a, b);
if((sqrt(minv) > (maxv - minv)) && (minv > 5))
{
//
@@ -630,8 +632,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
//
if(a < b)
{
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = !invert;
}
//
@@ -694,8 +696,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
}
if(fs < 0)
{
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = !invert;
xs = 1 - xs;
}
@@ -758,9 +760,9 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
if(ps < 0)
{
std::swap(a, b);
std::swap(p, q);
std::swap(xs, xs2);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
BOOST_MATH_GPU_SAFE_SWAP(xs, xs2);
invert = !invert;
}
//
@@ -823,8 +825,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
//
if(b < a)
{
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = !invert;
}
if (a < tools::min_value<T>())
@@ -890,9 +892,9 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
//
if(x > 0.5)
{
std::swap(a, b);
std::swap(p, q);
std::swap(x, y);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
BOOST_MATH_GPU_SAFE_SWAP(x, y);
invert = !invert;
T l = 1 - upper;
T u = 1 - lower;
@@ -922,8 +924,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
if(x < lower)
x = lower;
}
std::uintmax_t max_iter = policies::get_max_root_iterations<Policy>();
std::uintmax_t max_iter_used = 0;
boost::math::uintmax_t max_iter = policies::get_max_root_iterations<Policy>();
boost::math::uintmax_t max_iter_used = 0;
//
// Figure out how many digits to iterate towards:
//
@@ -946,7 +948,13 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
// Now iterate, we can use either p or q as the target here
// depending on which is smaller:
//
// Since we can't use halley_iterate on device we use newton raphson
//
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
x = boost::math::tools::halley_iterate(
#else
x = boost::math::tools::newton_raphson_iterate(
#endif
boost::math::detail::ibeta_roots<T, Policy>(a, b, (p < q ? p : q), (p < q ? false : true)), x, lower, upper, digits, max_iter);
policies::check_root_iterations<T>("boost::math::ibeta<%1%>(%1%, %1%, %1%)", max_iter + max_iter_used, pol);
//
@@ -968,7 +976,7 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
} // namespace detail
template <class T1, class T2, class T3, class T4, class Policy>
inline typename tools::promote_args<T1, T2, T3, T4>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3, T4>::type
ibeta_inv(T1 a, T2 b, T3 p, T4* py, const Policy& pol)
{
constexpr auto function = "boost::math::ibeta_inv<%1%>(%1%,%1%,%1%)";
@@ -1003,14 +1011,14 @@ inline typename tools::promote_args<T1, T2, T3, T4>::type
}
template <class T1, class T2, class T3, class T4>
inline typename tools::promote_args<T1, T2, T3, T4>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3, T4>::type
ibeta_inv(T1 a, T2 b, T3 p, T4* py)
{
return ibeta_inv(a, b, p, py, policies::policy<>());
}
template <class T1, class T2, class T3>
inline typename tools::promote_args<T1, T2, T3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
ibeta_inv(T1 a, T2 b, T3 p)
{
typedef typename tools::promote_args<T1, T2, T3>::type result_type;
@@ -1018,7 +1026,7 @@ inline typename tools::promote_args<T1, T2, T3>::type
}
template <class T1, class T2, class T3, class Policy>
inline typename tools::promote_args<T1, T2, T3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
ibeta_inv(T1 a, T2 b, T3 p, const Policy& pol)
{
typedef typename tools::promote_args<T1, T2, T3>::type result_type;
@@ -1026,7 +1034,7 @@ inline typename tools::promote_args<T1, T2, T3>::type
}
template <class T1, class T2, class T3, class T4, class Policy>
inline typename tools::promote_args<T1, T2, T3, T4>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3, T4>::type
ibetac_inv(T1 a, T2 b, T3 q, T4* py, const Policy& pol)
{
constexpr auto function = "boost::math::ibetac_inv<%1%>(%1%,%1%,%1%)";
@@ -1061,14 +1069,14 @@ inline typename tools::promote_args<T1, T2, T3, T4>::type
}
template <class T1, class T2, class T3, class T4>
inline typename tools::promote_args<T1, T2, T3, T4>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3, T4>::type
ibetac_inv(T1 a, T2 b, T3 q, T4* py)
{
return ibetac_inv(a, b, q, py, policies::policy<>());
}
template <class RT1, class RT2, class RT3>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_inv(RT1 a, RT2 b, RT3 q)
{
typedef typename tools::promote_args<RT1, RT2, RT3>::type result_type;
@@ -1076,7 +1084,7 @@ inline typename tools::promote_args<RT1, RT2, RT3>::type
}
template <class RT1, class RT2, class RT3, class Policy>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_inv(RT1 a, RT2 b, RT3 q, const Policy& pol)
{
typedef typename tools::promote_args<RT1, RT2, RT3>::type result_type;
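
As the #ifdef in ibeta_inv_imp notes, device builds swap halley_iterate for newton_raphson_iterate. The Newton-Raphson driver wants a functor returning the value and first derivative; a self-contained illustrative sketch using the canonical cube-root functor (not the ibeta_roots functor above):

#include <boost/math/tools/roots.hpp>
#include <cstdint>
#include <utility>

struct cbrt_functor
{
   explicit cbrt_functor(double target) : a(target) {}
   // Returns f(x) = x^3 - a and f'(x) = 3x^2, as Newton-Raphson requires.
   std::pair<double, double> operator()(double x) const
   { return { x * x * x - a, 3 * x * x }; }
   double a;
};

double cbrt_by_newton(double a) // bracket below is valid for a >= 1
{
   std::uintmax_t max_iter = 50;
   return boost::math::tools::newton_raphson_iterate(
      cbrt_functor(a), a / 2, 0.0, a, 40, max_iter); // e.g. 27 -> 3
}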

View File

@@ -11,6 +11,9 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/special_functions/cbrt.hpp>
#include <boost/math/special_functions/round.hpp>
#include <boost/math/special_functions/trunc.hpp>
@@ -24,7 +27,7 @@ namespace boost{ namespace math{ namespace detail{
// Communications of the ACM, 13(10): 619-620, Oct., 1970.
//
template <class T, class Policy>
T inverse_students_t_hill(T ndf, T u, const Policy& pol)
BOOST_MATH_GPU_ENABLED T inverse_students_t_hill(T ndf, T u, const Policy& pol)
{
BOOST_MATH_STD_USING
BOOST_MATH_ASSERT(u <= 0.5);
@@ -74,7 +77,7 @@ T inverse_students_t_hill(T ndf, T u, const Policy& pol)
// Journal of Computational Finance, Vol 9 Issue 4, pp 37-73, Summer 2006
//
template <class T, class Policy>
T inverse_students_t_tail_series(T df, T v, const Policy& pol)
BOOST_MATH_GPU_ENABLED T inverse_students_t_tail_series(T df, T v, const Policy& pol)
{
BOOST_MATH_STD_USING
// Tail series expansion, see section 6 of Shaw's paper.
@@ -125,7 +128,7 @@ T inverse_students_t_tail_series(T df, T v, const Policy& pol)
}
template <class T, class Policy>
T inverse_students_t_body_series(T df, T u, const Policy& pol)
BOOST_MATH_GPU_ENABLED T inverse_students_t_body_series(T df, T u, const Policy& pol)
{
BOOST_MATH_STD_USING
//
@@ -204,7 +207,7 @@ T inverse_students_t_body_series(T df, T u, const Policy& pol)
}
template <class T, class Policy>
T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr)
BOOST_MATH_GPU_ENABLED T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr)
{
//
// df = number of degrees of freedom.
@@ -220,7 +223,7 @@ T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr)
if(u > v)
{
// function is symmetric, invert it:
std::swap(u, v);
BOOST_MATH_GPU_SAFE_SWAP(u, v);
invert = true;
}
if((floor(df) == df) && (df < 20))
@@ -416,7 +419,7 @@ calculate_real:
}
template <class T, class Policy>
inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol)
{
T u = p / 2;
T v = 1 - u;
@@ -427,7 +430,7 @@ inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol)
}
template <class T, class Policy>
inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::false_type*)
BOOST_MATH_GPU_ENABLED inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const boost::math::false_type*)
{
BOOST_MATH_STD_USING
//
@@ -450,12 +453,12 @@ inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::f
}
template <class T, class Policy>
T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::true_type*)
BOOST_MATH_GPU_ENABLED T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const boost::math::true_type*)
{
BOOST_MATH_STD_USING
bool invert = false;
if((df < 2) && (floor(df) != df))
return boost::math::detail::fast_students_t_quantile_imp(df, p, pol, static_cast<std::false_type*>(nullptr));
return boost::math::detail::fast_students_t_quantile_imp(df, p, pol, static_cast<boost::math::false_type*>(nullptr));
if(p > 0.5)
{
p = 1 - p;
@@ -521,7 +524,7 @@ T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::true_typ
}
template <class T, class Policy>
inline T fast_students_t_quantile(T df, T p, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T fast_students_t_quantile(T df, T p, const Policy& pol)
{
typedef typename policies::evaluation<T, Policy>::type value_type;
typedef typename policies::normalise<
@@ -531,12 +534,12 @@ inline T fast_students_t_quantile(T df, T p, const Policy& pol)
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
typedef std::integral_constant<bool,
(std::numeric_limits<T>::digits <= 53)
typedef boost::math::integral_constant<bool,
(boost::math::numeric_limits<T>::digits <= 53)
&&
(std::numeric_limits<T>::is_specialized)
(boost::math::numeric_limits<T>::is_specialized)
&&
(std::numeric_limits<T>::radix == 2)
(boost::math::numeric_limits<T>::radix == 2)
> tag_type;
return policies::checked_narrowing_cast<T, forwarding_policy>(fast_students_t_quantile_imp(static_cast<value_type>(df), static_cast<value_type>(p), pol, static_cast<tag_type*>(nullptr)), "boost::math::students_t_quantile<%1%>(%1%,%1%,%1%)");
}
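
The std-to-boost swaps in the tag_type above matter because this compile-time dispatch must keep working where <type_traits> and <limits> are unavailable (NVRTC). Restating the pattern from the function above in miniature:

// The tag's value picks an overload at compile time, so only the
// matching implementation is instantiated and called:
typedef boost::math::integral_constant<bool,
   (boost::math::numeric_limits<T>::digits <= 53) &&
   (boost::math::numeric_limits<T>::is_specialized) &&
   (boost::math::numeric_limits<T>::radix == 2)> tag_type;

fast_students_t_quantile_imp(df, p, pol, static_cast<tag_type*>(nullptr));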

View File

@@ -1736,43 +1736,10 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool in
// Ratios of two gamma functions:
//
template <class T, class Policy, class Lanczos>
BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l)
BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos_final(T z, T delta, const Policy& pol, const Lanczos&)
{
BOOST_MATH_STD_USING
if(z < tools::epsilon<T>())
{
//
// We get spurious numeric overflow unless we're very careful, this
// can occur either inside Lanczos::lanczos_sum(z) or in the
// final combination of terms, to avoid this, split the product up
// into 2 (or 3) parts:
//
// G(z) / G(L) = 1 / (z * G(L)) ; z < eps, L = z + delta = delta
// z * G(L) = z * G(lim) * (G(L)/G(lim)) ; lim = largest factorial
//
if(boost::math::max_factorial<T>::value < delta)
{
T ratio = tgamma_delta_ratio_imp_lanczos(delta, T(boost::math::max_factorial<T>::value - delta), pol, l);
ratio *= z;
ratio *= boost::math::unchecked_factorial<T>(boost::math::max_factorial<T>::value - 1);
return 1 / ratio;
}
else
{
#ifdef BOOST_MATH_HAS_NVRTC
if (boost::math::is_same_v<T, float>)
{
return 1 / (z * ::tgammaf(z + delta));
}
else
{
return 1 / (z * ::tgamma(z + delta));
}
#else
return 1 / (z * boost::math::tgamma(z + delta, pol));
#endif
}
}
T zgh = static_cast<T>(z + T(Lanczos::g()) - constants::half<T>());
T result;
if(z + delta == z)
@@ -1806,6 +1773,50 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Poli
result *= pow(T(constants::e<T>() / (zgh + delta)), delta);
return result;
}
template <class T, class Policy, class Lanczos>
BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l)
{
BOOST_MATH_STD_USING
if(z < tools::epsilon<T>())
{
//
// We get spurious numeric overflow unless we're very careful, this
// can occur either inside Lanczos::lanczos_sum(z) or in the
// final combination of terms, to avoid this, split the product up
// into 2 (or 3) parts:
//
// G(z) / G(L) = 1 / (z * G(L)) ; z < eps, L = z + delta = delta
// z * G(L) = z * G(lim) * (G(L)/G(lim)) ; lim = largest factorial
//
if(boost::math::max_factorial<T>::value < delta)
{
T ratio = tgamma_delta_ratio_imp_lanczos_final(T(delta), T(boost::math::max_factorial<T>::value - delta), pol, l);
ratio *= z;
ratio *= boost::math::unchecked_factorial<T>(boost::math::max_factorial<T>::value - 1);
return 1 / ratio;
}
else
{
#ifdef BOOST_MATH_HAS_NVRTC
if (boost::math::is_same_v<T, float>)
{
return 1 / (z * ::tgammaf(z + delta));
}
else
{
return 1 / (z * ::tgamma(z + delta));
}
#else
return 1 / (z * boost::math::tgamma(z + delta, pol));
#endif
}
}
return tgamma_delta_ratio_imp_lanczos_final(T(z), T(delta), pol, l);
}
//
// And again without Lanczos support this time:
//

View File

@@ -26,7 +26,19 @@
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
#ifdef BOOST_MATH_HAS_NVRTC
namespace boost {
namespace math {
template <class RT1, class RT2, class A>
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, A>::type
beta(RT1 a, RT2 b, A arg);
} // namespace math
} // namespace boost
#else
#include <vector>
#include <complex>
@@ -154,9 +166,9 @@ namespace boost
// Binomial:
template <class T, class Policy>
T binomial_coefficient(unsigned n, unsigned k, const Policy& pol);
BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol);
template <class T>
T binomial_coefficient(unsigned n, unsigned k);
BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k);
// erf & erfc error functions.
template <class RT> // Error function.
@@ -874,19 +886,19 @@ namespace boost
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> cos_pi(T x);
template <class T>
int fpclassify BOOST_NO_MACRO_EXPAND(T t);
BOOST_MATH_GPU_ENABLED int fpclassify BOOST_NO_MACRO_EXPAND(T t);
template <class T>
bool isfinite BOOST_NO_MACRO_EXPAND(T z);
BOOST_MATH_GPU_ENABLED bool isfinite BOOST_NO_MACRO_EXPAND(T z);
template <class T>
bool isinf BOOST_NO_MACRO_EXPAND(T t);
BOOST_MATH_GPU_ENABLED bool isinf BOOST_NO_MACRO_EXPAND(T t);
template <class T>
bool isnan BOOST_NO_MACRO_EXPAND(T t);
BOOST_MATH_GPU_ENABLED bool isnan BOOST_NO_MACRO_EXPAND(T t);
template <class T>
bool isnormal BOOST_NO_MACRO_EXPAND(T t);
BOOST_MATH_GPU_ENABLED bool isnormal BOOST_NO_MACRO_EXPAND(T t);
template<class T>
BOOST_MATH_GPU_ENABLED int signbit BOOST_NO_MACRO_EXPAND(T x);
@@ -1218,62 +1230,62 @@ namespace boost
BOOST_MATH_DETAIL_11_FUNC(Policy)\
\
template <class RT1, class RT2>\
inline boost::math::tools::promote_args_t<RT1, RT2> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2> \
beta(RT1 a, RT2 b) { return ::boost::math::beta(a, b, Policy()); }\
\
template <class RT1, class RT2, class A>\
inline boost::math::tools::promote_args_t<RT1, RT2, A> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, A> \
beta(RT1 a, RT2 b, A x){ return ::boost::math::beta(a, b, x, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
betac(RT1 a, RT2 b, RT3 x) { return ::boost::math::betac(a, b, x, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibeta(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibeta(a, b, x, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibetac(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibetac(a, b, x, Policy()); }\
\
template <class T1, class T2, class T3, class T4>\
inline boost::math::tools::promote_args_t<T1, T2, T3, T4> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2, T3, T4> \
ibeta_inv(T1 a, T2 b, T3 p, T4* py){ return ::boost::math::ibeta_inv(a, b, p, py, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibeta_inv(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_inv(a, b, p, Policy()); }\
\
template <class T1, class T2, class T3, class T4>\
inline boost::math::tools::promote_args_t<T1, T2, T3, T4> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2, T3, T4> \
ibetac_inv(T1 a, T2 b, T3 q, T4* py){ return ::boost::math::ibetac_inv(a, b, q, py, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibeta_inva(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_inva(a, b, p, Policy()); }\
\
template <class T1, class T2, class T3>\
inline boost::math::tools::promote_args_t<T1, T2, T3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2, T3> \
ibetac_inva(T1 a, T2 b, T3 q){ return ::boost::math::ibetac_inva(a, b, q, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibeta_invb(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_invb(a, b, p, Policy()); }\
\
template <class T1, class T2, class T3>\
inline boost::math::tools::promote_args_t<T1, T2, T3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2, T3> \
ibetac_invb(T1 a, T2 b, T3 q){ return ::boost::math::ibetac_invb(a, b, q, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibetac_inv(RT1 a, RT2 b, RT3 q){ return ::boost::math::ibetac_inv(a, b, q, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibeta_derivative(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibeta_derivative(a, b, x, Policy()); }\
\
template <class T> T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient<T, Policy>(n, k, Policy()); }\
template <class T> BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient<T, Policy>(n, k, Policy()); }\
\
template <class RT>\
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> erf(RT z) { return ::boost::math::erf(z, Policy()); }\

View File

@@ -800,7 +800,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return
#define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast<T>(V)
#define BOOST_MATH_FORCEINLINE __forceinline__
#define BOOST_MATH_STD_USING
#define BOOST_MATH_IF_CONSTEXPR if constexpr
#define BOOST_MATH_IF_CONSTEXPR if
#define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point<T>::value)
#define BOOST_MATH_CONSTEXPR_TABLE_FUNCTION constexpr
#define BOOST_MATH_NO_EXCEPTIONS
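
With BOOST_MATH_IF_CONSTEXPR defined as a plain if under NVRTC, the branch is taken at run time rather than discarded at compile time, so both arms must compile for every instantiation. Call sites are unchanged; a sketch of the usage pattern seen elsewhere in this commit:

// 'if constexpr' on conforming compilers, a plain 'if' under NVRTC --
// so both branches must be well-formed for every T used here.
BOOST_MATH_IF_CONSTEXPR (boost::math::is_same_v<T, float>)
{
   return ::cbrtf(x);
}
else
{
   return ::cbrt(x);
}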

View File

@@ -25,6 +25,13 @@ run test_bernoulli_pdf_float.cu ;
run test_bernoulli_range_support_double.cu ;
run test_bernoulli_range_support_float.cu ;
run test_beta_dist_cdf_double.cu ;
run test_beta_dist_cdf_float.cu ;
run test_beta_dist_pdf_double.cu ;
run test_beta_dist_pdf_float.cu ;
run test_beta_dist_quan_double.cu ;
run test_beta_dist_quan_float.cu ;
run test_cauchy_cdf_double.cu ;
run test_cauchy_cdf_float.cu ;
run test_cauchy_pdf_double.cu ;
@@ -107,6 +114,24 @@ run test_weibull_quan_float.cu ;
# Special Functions
run test_beta_double.cu ;
run test_beta_float.cu ;
run test_betac_double.cu ;
run test_betac_float.cu ;
run test_ibeta_double.cu ;
run test_ibeta_float.cu ;
run test_ibeta_derivative_double.cu ;
run test_ibeta_derivative_float.cu ;
run test_ibeta_inv_double.cu ;
run test_ibeta_inv_float.cu ;
run test_ibeta_inva_double.cu ;
run test_ibeta_inva_float.cu ;
run test_ibeta_invb_double.cu ;
run test_ibeta_invb_float.cu ;
run test_ibetac_inv_double.cu ;
run test_ibetac_inv_float.cu ;
run test_ibetac_inva_double.cu ;
run test_ibetac_inva_float.cu ;
run test_ibetac_invb_double.cu ;
run test_ibetac_invb_float.cu ;
run test_bessel_i0_double.cu ;
run test_bessel_i0_float.cu ;

View File

@@ -24,6 +24,13 @@ run test_bernoulli_pdf_nvrtc_float.cpp ;
run test_bernoulli_quan_nvrtc_double.cpp ;
run test_bernoulli_quan_nvrtc_float.cpp ;
run test_beta_dist_cdf_nvrtc_double.cpp ;
run test_beta_dist_cdf_nvrtc_float.cpp ;
run test_beta_dist_pdf_nvrtc_double.cpp ;
run test_beta_dist_pdf_nvrtc_float.cpp ;
run test_beta_dist_quan_nvrtc_double.cpp ;
run test_beta_dist_quan_nvrtc_float.cpp ;
run test_cauchy_cdf_nvrtc_double.cpp ;
run test_cauchy_cdf_nvrtc_float.cpp ;
run test_cauchy_pdf_nvrtc_double.cpp ;
@@ -104,6 +111,26 @@ run test_weibull_quan_nvrtc_float.cpp ;
# Special Functions
run test_beta_nvrtc_double.cpp ;
run test_beta_nvrtc_float.cpp ;
run test_betac_nvrtc_double.cpp ;
run test_betac_nvrtc_float.cpp ;
run test_ibeta_nvrtc_double.cpp ;
run test_ibeta_nvrtc_float.cpp ;
run test_ibetac_nvrtc_double.cpp ;
run test_ibetac_nvrtc_float.cpp ;
run test_ibeta_derivative_nvrtc_double.cpp ;
run test_ibeta_derivative_nvrtc_float.cpp ;
run test_ibeta_inv_nvrtc_double.cpp ;
run test_ibeta_inv_nvrtc_float.cpp ;
run test_ibeta_inva_nvrtc_double.cpp ;
run test_ibeta_inva_nvrtc_float.cpp ;
run test_ibeta_invb_nvrtc_double.cpp ;
run test_ibeta_invb_nvrtc_float.cpp ;
run test_ibetac_inv_nvrtc_double.cpp ;
run test_ibetac_inv_nvrtc_float.cpp ;
run test_ibetac_inva_nvrtc_double.cpp ;
run test_ibetac_inva_nvrtc_float.cpp ;
run test_ibetac_invb_nvrtc_double.cpp ;
run test_ibetac_invb_nvrtc_float.cpp ;
run test_bessel_i0_nvrtc_double.cpp ;
run test_bessel_i0_nvrtc_float.cpp ;

View File

@@ -12,6 +12,7 @@ project : requirements
# Distributions
run test_arcsine.cpp ;
run test_bernoulli.cpp ;
run test_beta_dist.cpp ;
run test_cauchy.cpp ;
run test_chi_squared.cpp ;
run test_exponential_dist.cpp ;
@@ -28,6 +29,10 @@ run test_weibull.cpp ;
run pow_test.cpp ;
run test_beta_simple.cpp ;
run test_beta.cpp ;
run test_ibeta.cpp ;
run test_ibeta_inv.cpp ;
run test_ibeta_inv_ab.cpp ;
run test_bessel_i.cpp ;
run test_bessel_j.cpp ;

View File

@@ -15,7 +15,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_bessel_i.hpp"

View File

@@ -15,7 +15,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_bessel_j.hpp"

View File

@@ -22,7 +22,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_bessel_k.hpp"

View File

@@ -15,7 +15,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_bessel_y.hpp"

View File

@@ -5,7 +5,17 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include "pch_light.hpp"
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#endif
#ifdef __clang__
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_beta.hpp"

View File

@@ -18,9 +18,10 @@
#define BOOST_TEST_MAIN
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/stats.hpp>
#include <boost/math/tools/test.hpp>
#include "../include_private/boost/math/tools/test.hpp"
#include <boost/math/constants/constants.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>
@@ -109,9 +110,12 @@ void test_spots(T)
// Inexact input, so disable for ultra precise long doubles:
BOOST_CHECK_CLOSE(::boost::math::beta(static_cast<T>(0.0125L), static_cast<T>(0.000023L)), static_cast<T>(43558.24045647538375006349016083320744662L), tolerance * 2);
}
#ifndef BOOST_MATH_NO_EXCEPTIONS
BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(0), static_cast<T>(1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(-1), static_cast<T>(1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(1), static_cast<T>(-1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(1), static_cast<T>(0)), std::domain_error);
#endif
}


@@ -32,9 +32,14 @@
# pragma warning (disable : 4224) // nonstandard extension used : formal parameter 'arg' was previously defined as a type.
#endif
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
#include <boost/math/concepts/real_concept.hpp> // for real_concept
using ::boost::math::concepts::real_concept;
#include <boost/math/tools/test.hpp>
#endif
#include "../include_private/boost/math/tools/test.hpp"
#include <boost/math/distributions/beta.hpp> // for beta_distribution
using boost::math::beta_distribution;
@@ -634,12 +639,13 @@ BOOST_AUTO_TEST_CASE( test_main )
BOOST_CHECK_CLOSE_FRACTION(mybeta22.find_alpha(mybeta22.beta(), 0.8, cdf(mybeta22, 0.8)), mybeta22.alpha(), tol);
BOOST_CHECK_CLOSE_FRACTION(mybeta22.find_beta(mybeta22.alpha(), 0.8, cdf(mybeta22, 0.8)), mybeta22.beta(), tol);
#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
beta_distribution<real_concept> rcbeta22(2, 2); // Using RealType real_concept.
cout << "numeric_limits<real_concept>::is_specialized " << numeric_limits<real_concept>::is_specialized << endl;
cout << "numeric_limits<real_concept>::digits " << numeric_limits<real_concept>::digits << endl;
cout << "numeric_limits<real_concept>::digits10 " << numeric_limits<real_concept>::digits10 << endl;
cout << "numeric_limits<real_concept>::epsilon " << numeric_limits<real_concept>::epsilon() << endl;
#endif
// (Parameter value, arbitrarily zero, only communicates the floating point type).
test_spots(0.0F); // Test float.


@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * CDF for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
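// Integer ceiling division: launch enough blocks to cover every element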
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
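// Block until the kernel completes so the elapsed time and error check below are meaningful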
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(cdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * CDF for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(cdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
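// The unnamed second parameter is unused; the host still supplies a buffer (d_in2)
// for it, so all of these test kernels can share one launch signature.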
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
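// The include paths below are machine-specific; point --include-path at your own checkout.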
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = cdf(boost::math::beta_distribution<float_type>(), h_in1[i]);
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
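// cuLaunchKernel receives the kernel arguments as an array of pointers, in declaration order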
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = cdf(boost::math::beta_distribution<float_type>(), h_in1[i]);
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * PDF for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = pdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
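// The default range [0, 1) matches the support of the beta distribution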
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(pdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * PDF for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = pdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(pdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
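// Compare device and host values in units of machine epsilon; 100 eps of slack
// absorbs minor rounding differences between GPU and CPU math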
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = pdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
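// Fixed seed keeps the host/device comparison reproducible across runs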
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = pdf(boost::math::beta_distribution<float_type>(), h_in1[i]);
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = pdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = pdf(boost::math::beta_distribution<float_type>(), h_in1[i]);
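// With the ignore_error overflow policy above, overflow yields an infinity
// instead of throwing, so non-finite reference values are skipped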
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * quantile for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
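// quantile() inverts the CDF, so this exercises the ibeta_inv code path on the device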
out[i] = quantile(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(quantile(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * quantile for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = quantile(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
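// Repeat the computation serially on the host, both as the reference result and for a rough timing comparison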
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(quantile(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = quantile(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = quantile(boost::math::beta_distribution<float_type>(), h_in1[i]);
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = quantile(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
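// Quantile arguments are probabilities; the half-open range [0, 1) keeps them valid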
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = quantile(boost::math::beta_distribution<float_type>(), h_in1[i]);
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

test/test_betac_double.cu (new file, 146 lines)

@@ -0,0 +1,146 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
 * CUDA kernel device code: evaluates betac (the complement of the
 * normalised incomplete beta function) for each input triple.
 */
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
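// table_type and SC_ are required by the tabulated test data in the .ipp files included below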
#include "beta_med_data.ipp"
#include "beta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < beta_med_data.size(); ++i)
{
v1.push_back(beta_med_data[i][0]);
v2.push_back(beta_med_data[i][1]);
v3.push_back(beta_med_data[i][2]);
}
for(unsigned i = 0; i < beta_small_data.size(); ++i)
{
v1.push_back(beta_small_data[i][0]);
v2.push_back(beta_small_data[i][1]);
v3.push_back(beta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::betac(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
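
cuda_managed_ptr above is a test-local RAII wrapper around CUDA unified memory; its real definition lives in test/cuda_managed_ptr.hpp. A hypothetical minimal version, assuming only the get() and operator[] interface these tests use:

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical sketch of cuda_managed_ptr -- the real test header may differ.
template <class T>
class cuda_managed_ptr
{
    T* ptr_ = nullptr;

public:
    explicit cuda_managed_ptr(std::size_t n)
    {
        // Unified memory: the same pointer is valid on host and device.
        cudaMallocManaged(&ptr_, n * sizeof(T));
    }
    ~cuda_managed_ptr() { cudaFree(ptr_); }
    cuda_managed_ptr(const cuda_managed_ptr&) = delete;
    cuda_managed_ptr& operator=(const cuda_managed_ptr&) = delete;

    T* get() const { return ptr_; }
    T& operator[](std::size_t i) const { return ptr_[i]; }
};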

test/test_betac_float.cu (new file)

@@ -0,0 +1,146 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "beta_med_data.ipp"
#include "beta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < beta_med_data.size(); ++i)
{
v1.push_back(beta_med_data[i][0]);
v2.push_back(beta_med_data[i][1]);
v3.push_back(beta_med_data[i][2]);
}
for(unsigned i = 0; i < beta_small_data.size(); ++i)
{
v1.push_back(beta_small_data[i][0]);
v2.push_back(beta_small_data[i][1]);
v3.push_back(beta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::betac(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
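
The watch timer from stopwatch.hpp is only exercised through construction, elapsed(), and reset(). A chrono-based stand-in with that interface (the real helper may differ):

#include <chrono>

// Hypothetical stand-in for the 'watch' helper used by the .cu tests.
class watch
{
    std::chrono::steady_clock::time_point start_ = std::chrono::steady_clock::now();

public:
    // Seconds since construction or the last reset().
    double elapsed() const
    {
        return std::chrono::duration<double>(std::chrono::steady_clock::now() - start_).count();
    }
    void reset() { start_ = std::chrono::steady_clock::now(); }
};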


@@ -0,0 +1,196 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_beta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
h_in3[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::betac(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
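
Both the driver-API launcher here and the runtime-API tests above size the grid with the same ceiling division, so the last block may be only partially full; the i < numElements guard in each kernel masks off the excess threads. The arithmetic, checked at compile time:

// Ceiling division used for grid sizing throughout this commit.
constexpr int blocks_for(int num_elements, int block_size)
{
    return (num_elements + block_size - 1) / block_size;
}

static_assert(blocks_for(5000, 256) == 20, "5000 elements -> 20 blocks (5120 threads)");
static_assert(blocks_for(50000, 256) == 196, "50000 elements -> 196 blocks (50176 threads)");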


@@ -13,7 +13,7 @@
 # pragma clang diagnostic ignored "-Wliteral-range"
 #elif defined(__GNUC__)
 # pragma GCC diagnostic push
-# pragma GCC diagnostic ignored "-Wliteral-range"
+# pragma GCC diagnostic ignored "-Woverflow"
 #endif
 #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error


@@ -21,7 +21,7 @@
 # pragma clang diagnostic ignored "-Wliteral-range"
 #elif defined(__GNUC__)
 # pragma GCC diagnostic push
-# pragma GCC diagnostic ignored "-Wliteral-range"
+# pragma GCC diagnostic ignored "-Woverflow"
 #endif
 using boost::math::holtsmark_distribution;


@@ -3,7 +3,18 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wliteral-range"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Woverflow"
+#endif
 #include "test_ibeta.hpp"
 #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT)


@@ -8,9 +8,10 @@
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/math_fwd.hpp>
 #include <boost/math/tools/stats.hpp>
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include <boost/math/constants/constants.hpp>
 #include <boost/type_traits/is_floating_point.hpp>
 #include <boost/array.hpp>


@@ -4,7 +4,7 @@
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 #if defined(__GNUC__) && __GNUC__ <= 12
 #pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wliteral-range"
+#pragma GCC diagnostic ignored "-Woverflow"
 #endif
 #include <pch_light.hpp>
 #include "test_ibeta_derivative.hpp"


@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
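
ibeta_derivative(a, b, x) is documented as the partial derivative of the regularised incomplete beta function with respect to x, which is exactly the beta distribution's density. A quick host-side cross-check of that identity (values chosen arbitrarily):

#include <boost/math/special_functions/beta.hpp>
#include <boost/math/distributions/beta.hpp>
#include <cassert>
#include <cmath>

int main()
{
    const double a = 2.5, b = 3.5, x = 0.25;
    const double lhs = boost::math::ibeta_derivative(a, b, x);
    const double rhs = boost::math::pdf(boost::math::beta_distribution<double>(a, b), x);
    // d/dx ibeta(a, b, x) == pdf of beta(a, b) at x, up to rounding.
    assert(std::fabs(lhs - rhs) <= 1e-12 * rhs);
}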


@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_derivative_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_derivative_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_derivative_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_derivative_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta_derivative(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_derivative_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_derivative_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_derivative_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_derivative_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta_derivative(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

test/test_ibeta_double.cu (new file)

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
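
ibeta and the betac tested earlier are complements of one another in their regularised forms, which gives a cheap host-side sanity check independent of the tabulated data:

#include <boost/math/special_functions/beta.hpp>
#include <cassert>
#include <cmath>

int main()
{
    const double a = 4.0, b = 2.0, x = 0.3;
    // Regularised incomplete beta and its complement sum to 1.
    const double sum = boost::math::ibeta(a, b, x) + boost::math::ibetac(a, b, x);
    assert(std::fabs(sum - 1.0) < 1e-14);
}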

test/test_ibeta_float.cu (new file)

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -3,7 +3,18 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wliteral-range"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Woverflow"
+#endif
 #include "test_ibeta_inv.hpp"
 #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT)


@@ -8,10 +8,11 @@
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/math/special_functions/beta.hpp>
 #include <boost/math/special_functions/next.hpp> // for has_denorm_now
+#include <boost/math/special_functions/math_fwd.hpp>
 #include <boost/math/tools/stats.hpp>
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include <boost/math/constants/constants.hpp>
 #include <boost/type_traits/is_floating_point.hpp>
 #include <boost/array.hpp>
@@ -306,6 +307,7 @@ void test_spots(T)
 BOOST_MATH_CHECK_THROW(::boost::math::ibeta_inv(static_cast<T>(2.125), -n, static_cast<T>(0.125)), std::domain_error);
 BOOST_MATH_CHECK_THROW(::boost::math::ibeta_inv(static_cast<T>(2.125), static_cast<T>(1.125), -n), std::domain_error);
 }
+#ifndef SYCL_LANGUAGE_VERSION
 if (boost::math::detail::has_denorm_now<T>())
 {
 T m = std::numeric_limits<T>::denorm_min();
@@ -317,5 +319,6 @@ void test_spots(T)
 BOOST_CHECK((boost::math::isfinite)(boost::math::ibeta_inv(static_cast<T>(12.125), m, static_cast<T>(0.125))));
 BOOST_CHECK((boost::math::isfinite)(boost::math::ibeta_inv(m, m, static_cast<T>(0.125))));
 }
+#endif
 }


@@ -3,7 +3,18 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wliteral-range"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Woverflow"
+#endif
 #include "test_ibeta_inv_ab.hpp"
 #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT)


@@ -10,9 +10,10 @@
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/math_fwd.hpp>
 #include <boost/math/tools/stats.hpp>
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include <boost/math/constants/constants.hpp>
 #include <boost/type_traits/is_floating_point.hpp>
 #include <boost/array.hpp>


@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
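// This variant exercises single precision. BOOST_MATH_PROMOTE_DOUBLE_POLICY is
// set to false above because internal promotion to long double is unavailable
// in CUDA device code.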
/**
* CUDA kernel: computes boost::math::ibeta_inv element-wise over the input arrays.
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inv_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_inv_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
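// Note: the include paths above are specific to the CI image and the author's
// machine; adjust them to your own Boost and CUDA installations.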
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inv_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
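// cuLaunchKernel takes an array of pointers to each kernel argument, hence the
// address-of on every parameter below.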
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta_inv(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inv_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_inv_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inv_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta_inv(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA kernel: computes boost::math::ibeta_inva element-wise (per the Boost.Math
* docs, the a parameter satisfying ibeta(a, b, x) == p).
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
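// Cycle through the consolidated table so all 50000 work items get real data.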
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA kernel: computes boost::math::ibeta_inva element-wise (per the Boost.Math
* docs, the a parameter satisfying ibeta(a, b, x) == p).
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
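// Tolerance: flag anything more than 300 machine epsilons from the serial reference.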
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inva_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_inva_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inva_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Occasionally the ignore_error policy is not honored and the call below throws;
// rather than terminating, skip the row and keep processing the results array.
double res;
try
{
res = boost::math::ibeta_inva(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inva_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_inva_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
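// From here the driver API takes over: the PTX is JIT-loaded as a module and the
// kernel retrieved as a CUfunction by its unmangled name.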
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inva_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Occasionally the ignore_error policy is not honored and the call below throws;
// rather than terminating, skip the row and keep processing the results array.
double res;
try
{
res = boost::math::ibeta_inva(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA kernel: computes boost::math::ibeta_invb element-wise (per the Boost.Math
* docs, the b parameter satisfying ibeta(a, b, x) == p).
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA kernel: computes boost::math::ibeta_invb element-wise (per the Boost.Math
* docs, the b parameter satisfying ibeta(a, b, x) == p).
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_invb_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_invb_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_invb_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Occasionally the ignore_error policy is not honored and the call below throws;
// rather than terminating, skip the row and keep processing the results array.
double res;
try
{
res = boost::math::ibeta_invb(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_invb_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_invb_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_invb_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
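// Ceiling division: size the grid so every element is covered even when numElements is not a multiple of blockSize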
int numBlocks = (numElements + blockSize - 1) / blockSize;
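// The CUDA driver API receives kernel arguments as an array of pointers to each argument value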
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
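// Allow up to 300 eps of disagreement: device code may use FMA contraction and different rounding paths than the host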
for (int i = 0; i < numElements; ++i)
{
// Sometimes the ignore_error policy is not applied and the call below throws.
// Rather than terminating, continue processing the rest of the results array.
double res;
try
{
res = boost::math::ibeta_invb(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
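// Recompute each value on the host with the same inputs to serve as the reference result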
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
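// cuda_managed_ptr is a test helper that (presumably via cudaMallocManaged) shares these buffers between host and device, so no explicit copies are needed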
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
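// Recycle the consolidated table entries to fill all numElements inputs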
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the test CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
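// Time the kernel from launch through device synchronization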
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the test CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
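// extern "C" disables name mangling so cuModuleGetFunction can locate the kernel by name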
extern "C" __global__
void test_ibetac_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inv_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_inv_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inv_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibetac_inv(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inv_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_inv_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inv_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibetac_inv(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the test CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the test CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inva_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_inva_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inva_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Sometimes the ignore_error policy is not applied and the call below throws.
// Rather than terminating, continue processing the rest of the results array.
double res;
try
{
res = boost::math::ibetac_inva(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inva_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_inva_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inva_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Sometimes the ignore_error policy is not applied, so the call below can throw.
// Rather than terminating, we continue through the rest of the results array.
double res;
try
{
res = boost::math::ibetac_inva(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
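The try/catch around the serial reference call above works around the macro-based ignore_error policy occasionally not taking effect. A minimal sketch (not part of this commit) of requesting the same policies per call, so the host-side reference computation cannot throw regardless of header inclusion order; the helper name is hypothetical:

#include <boost/math/special_functions/beta.hpp>
#include <boost/math/policies/policy.hpp>

double reference_ibetac_inva(double b, double x, double p)
{
    using namespace boost::math::policies;
    // Per-call policy equivalent to the BOOST_MATH_*_ERROR_POLICY macros above.
    using ignore_all = policy<domain_error<ignore_error>,
                              overflow_error<ignore_error>,
                              evaluation_error<ignore_error>>;
    return boost::math::ibetac_inva(b, x, p, ignore_all());
}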

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
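These timing tests allocate through the repo's cuda_managed_ptr helper. For readers without the test tree, a minimal sketch of the kind of RAII wrapper that name implies; the shipped header may differ in detail, and the class name here is changed so it is not mistaken for the real interface:

#include <cuda_runtime.h>
#include <cstddef>

template <class T>
class managed_buffer   // hypothetical stand-in for cuda_managed_ptr
{
    T* ptr_ = nullptr;
public:
    explicit managed_buffer(std::size_t n) { cudaMallocManaged(&ptr_, n * sizeof(T)); }
    ~managed_buffer() { cudaFree(ptr_); }
    managed_buffer(const managed_buffer&) = delete;
    managed_buffer& operator=(const managed_buffer&) = delete;
    T* get() const { return ptr_; }                          // raw pointer for kernel launches
    T& operator[](std::size_t i) const { return ptr_[i]; }   // host-side access to managed memory
};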

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
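The 300-eps tolerance check recurs in every verification loop. Factored out, it looks like the sketch below; this is a convenience for exposition, not code from the commit, and the threshold is the tests' own choice rather than a library constant:

#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/math/special_functions/fpclassify.hpp>

template <class T>
bool within_tolerance(T device_result, T host_result, T max_eps = T(300))
{
    // Non-finite host references are skipped, mirroring the loops above.
    if (!(boost::math::isfinite)(host_result))
        return true;
    return boost::math::epsilon_difference(device_result, host_result) <= max_eps;
}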

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_invb_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_invb_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_invb_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Sometimes the ignore_error policy is not applied, so the call below can throw.
// Rather than terminating, we continue through the rest of the results array.
double res;
try
{
res = boost::math::ibetac_invb(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
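Each NVRTC test registers its kernel name with nvrtcAddNameExpression even though the kernel is declared extern "C" and is therefore unmangled. When that linkage is dropped, the lowered (mangled) name must be queried after compilation, roughly as sketched below; the kernel name is illustrative and this snippet is not part of the commit:

// Sketch: retrieving the mangled name when the kernel is not extern "C".
// The name expression must have been registered with nvrtcAddNameExpression
// before nvrtcCompileProgram was called.
const char* lowered_name = nullptr;
checkNVRTCError(nvrtcGetLoweredName(prog, "some_templated_kernel", &lowered_name),
                "Failed to get lowered name");
checkCUError(cuModuleGetFunction(&kernel, module, lowered_name),
             "Failed to get kernel function");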

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_invb_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_invb_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_invb_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Sometimes the ignore_error policy is not applied, so the call below can throw.
// Rather than terminating, we continue through the rest of the results array.
double res;
try
{
res = boost::math::ibetac_invb(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
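The BOOST_MATH_PROMOTE_DOUBLE_POLICY false define that heads these tests corresponds to the promote_double<false> policy, which stops double arguments being internally promoted to long double, a type CUDA devices do not support. A per-call sketch of the same request (function and variable names illustrative, not from the commit):

#include <boost/math/special_functions/beta.hpp>
#include <boost/math/policies/policy.hpp>

float_type ibetac_no_long_double(float_type a, float_type b, float_type x)
{
    // Equivalent of defining BOOST_MATH_PROMOTE_DOUBLE_POLICY to false,
    // expressed as a per-call policy.
    using no_ld = boost::math::policies::policy<
        boost::math::policies::promote_double<false>>;
    return boost::math::ibetac(a, b, x, no_ld());
}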

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibetac(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
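All of these tests size their grids with the same ceiling-division idiom. A worked instance for concreteness (the numbers here are illustrative):

int blockSize   = 256;
int numElements = 1000;                                       // example value
int numBlocks   = (numElements + blockSize - 1) / blockSize;  // (1000 + 255) / 256 == 4
// Four blocks cover 1024 threads; the `if (i < numElements)` guard in each
// kernel discards the 24 surplus threads.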

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibetac(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -12,7 +12,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_igamma.hpp"

View File

@@ -12,7 +12,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include "test_igamma_inv.hpp"

View File

@@ -12,7 +12,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_igamma_inva.hpp"

View File

@@ -20,7 +20,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
using boost::math::landau_distribution;

View File

@@ -9,7 +9,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#define BOOST_TEST_MAIN