
Apply GPU markers to ibeta_inv_ab

Remove NVRTC workaround

Apply GPU markers to ibeta_inverse

Apply GPU markers to t_dist_inv

Fix warning suppression

Add dispatch function and remove workaround

Move disabling block

Make binomial GPU enabled

Add SYCL testing of ibeta

Add SYCL testing of ibeta_inv

Add SYCL testing of ibeta_inv_ab

Add SYCL testing of full beta suite

Add markers to fwd decls

Add special forward decls for NVRTC

Add betac nvrtc testing

Add betac CUDA testing

Add ibeta CUDA testing

Add ibeta NVRTC testing

Add ibetac NVRTC testing

Add ibeta_derivative testing to nvrtc

Add ibeta_derivative CUDA testing

Add cbrt policy overload for NVRTC

Fix NVRTC definition of BOOST_MATH_IF_CONSTEXPR

Add ibeta_inv and ibetac_inv NVRTC testing

Fix make pair helper on device

Add CUDA testing of ibeta_inv* and ibetac_inv*

Move location so that it also works on NVRTC

Add NVRTC testing of ibeta_inv* and ibetac_inv*

Fixup test sets since they ignore the policy

Make the beta dist GPU compatible

Add beta dist SYCL testing

Add beta dist CUDA testing

Add beta dist NVRTC testing
This commit is contained in:
Matt Borland
2024-08-30 13:45:53 -04:00
parent 0002de0b85
commit adf8abd346
85 changed files with 9095 additions and 240 deletions
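
Most of the diff below consists of prefixing functions with BOOST_MATH_GPU_ENABLED so the same code compiles for host and device. As a rough sketch of the usual shape of such a marker (an assumption for illustration; the real definition lives in boost/math/tools/config.hpp and also covers SYCL and HIP builds):

// Sketch only -- not the exact Boost.Math definition.
#if defined(__CUDACC__) || defined(__CUDACC_RTC__)
#  define BOOST_MATH_GPU_ENABLED __host__ __device__
#else
#  define BOOST_MATH_GPU_ENABLED
#endif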

View File

@@ -25,12 +25,15 @@
#ifndef BOOST_MATH_DIST_BETA_HPP
#define BOOST_MATH_DIST_BETA_HPP
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/distributions/fwd.hpp>
#include <boost/math/special_functions/beta.hpp> // for beta.
#include <boost/math/distributions/complement.hpp> // complements.
#include <boost/math/distributions/detail/common_error_handling.hpp> // error checks
#include <boost/math/special_functions/fpclassify.hpp> // isnan.
#include <boost/math/tools/roots.hpp> // for root finding.
#include <boost/math/policies/error_handling.hpp>
#if defined (BOOST_MSVC)
# pragma warning(push)
@@ -38,8 +41,6 @@
// in domain_error_imp in error_handling
#endif
#include <utility>
namespace boost
{
namespace math
@@ -48,7 +49,7 @@ namespace boost
{
// Common error checking routines for beta distribution functions:
template <class RealType, class Policy>
inline bool check_alpha(const char* function, const RealType& alpha, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_alpha(const char* function, const RealType& alpha, RealType* result, const Policy& pol)
{
if(!(boost::math::isfinite)(alpha) || (alpha <= 0))
{
@@ -61,7 +62,7 @@ namespace boost
} // bool check_alpha
template <class RealType, class Policy>
inline bool check_beta(const char* function, const RealType& beta, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_beta(const char* function, const RealType& beta, RealType* result, const Policy& pol)
{
if(!(boost::math::isfinite)(beta) || (beta <= 0))
{
@@ -74,7 +75,7 @@ namespace boost
} // bool check_beta
template <class RealType, class Policy>
inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol)
{
if((p < 0) || (p > 1) || !(boost::math::isfinite)(p))
{
@@ -87,7 +88,7 @@ namespace boost
} // bool check_prob
template <class RealType, class Policy>
inline bool check_x(const char* function, const RealType& x, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_x(const char* function, const RealType& x, RealType* result, const Policy& pol)
{
if(!(boost::math::isfinite)(x) || (x < 0) || (x > 1))
{
@@ -100,28 +101,28 @@ namespace boost
} // bool check_x
template <class RealType, class Policy>
inline bool check_dist(const char* function, const RealType& alpha, const RealType& beta, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_dist(const char* function, const RealType& alpha, const RealType& beta, RealType* result, const Policy& pol)
{ // Check both alpha and beta.
return check_alpha(function, alpha, result, pol)
&& check_beta(function, beta, result, pol);
} // bool check_dist
template <class RealType, class Policy>
inline bool check_dist_and_x(const char* function, const RealType& alpha, const RealType& beta, RealType x, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_dist_and_x(const char* function, const RealType& alpha, const RealType& beta, RealType x, RealType* result, const Policy& pol)
{
return check_dist(function, alpha, beta, result, pol)
&& beta_detail::check_x(function, x, result, pol);
} // bool check_dist_and_x
template <class RealType, class Policy>
inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol)
{
return check_dist(function, alpha, beta, result, pol)
&& check_prob(function, p, result, pol);
} // bool check_dist_and_prob
template <class RealType, class Policy>
inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol)
{
if(!(boost::math::isfinite)(mean) || (mean <= 0))
{
@@ -133,7 +134,7 @@ namespace boost
return true;
} // bool check_mean
template <class RealType, class Policy>
inline bool check_variance(const char* function, const RealType& variance, RealType* result, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline bool check_variance(const char* function, const RealType& variance, RealType* result, const Policy& pol)
{
if(!(boost::math::isfinite)(variance) || (variance <= 0))
{
@@ -157,7 +158,7 @@ namespace boost
typedef RealType value_type;
typedef Policy policy_type;
beta_distribution(RealType l_alpha = 1, RealType l_beta = 1) : m_alpha(l_alpha), m_beta(l_beta)
BOOST_MATH_GPU_ENABLED beta_distribution(RealType l_alpha = 1, RealType l_beta = 1) : m_alpha(l_alpha), m_beta(l_beta)
{
RealType result;
beta_detail::check_dist(
@@ -167,11 +168,11 @@ namespace boost
&result, Policy());
} // beta_distribution constructor.
// Accessor functions:
RealType alpha() const
BOOST_MATH_GPU_ENABLED RealType alpha() const
{
return m_alpha;
}
RealType beta() const
BOOST_MATH_GPU_ENABLED RealType beta() const
{ // .
return m_beta;
}
@@ -183,11 +184,11 @@ namespace boost
// http://www.itl.nist.gov/div898/handbook/eda/section3/eda366h.htm
// http://www.epi.ucdavis.edu/diagnostictests/betabuster.html
static RealType find_alpha(
BOOST_MATH_GPU_ENABLED static RealType find_alpha(
RealType mean, // Expected value of mean.
RealType variance) // Expected value of variance.
{
static const char* function = "boost::math::beta_distribution<%1%>::find_alpha";
constexpr auto function = "boost::math::beta_distribution<%1%>::find_alpha";
RealType result = 0; // of error checks.
if(false ==
(
@@ -201,11 +202,11 @@ namespace boost
return mean * (( (mean * (1 - mean)) / variance)- 1);
} // RealType find_alpha
static RealType find_beta(
BOOST_MATH_GPU_ENABLED static RealType find_beta(
RealType mean, // Expected value of mean.
RealType variance) // Expected value of variance.
{
static const char* function = "boost::math::beta_distribution<%1%>::find_beta";
constexpr auto function = "boost::math::beta_distribution<%1%>::find_beta";
RealType result = 0; // of error checks.
if(false ==
(
@@ -223,12 +224,12 @@ namespace boost
// Estimate alpha & beta from either alpha or beta, and x and probability.
// Uses for these parameter estimators are unclear.
static RealType find_alpha(
BOOST_MATH_GPU_ENABLED static RealType find_alpha(
RealType beta, // from beta.
RealType x, // x.
RealType probability) // cdf
{
static const char* function = "boost::math::beta_distribution<%1%>::find_alpha";
constexpr auto function = "boost::math::beta_distribution<%1%>::find_alpha";
RealType result = 0; // of error checks.
if(false ==
(
@@ -245,13 +246,13 @@ namespace boost
return static_cast<RealType>(ibeta_inva(beta, x, probability, Policy()));
} // RealType find_alpha(beta, a, probability)
static RealType find_beta(
BOOST_MATH_GPU_ENABLED static RealType find_beta(
// ibeta_invb(T b, T x, T p); (alpha, x, cdf,)
RealType alpha, // alpha.
RealType x, // probability x.
RealType probability) // probability cdf.
{
static const char* function = "boost::math::beta_distribution<%1%>::find_beta";
constexpr auto function = "boost::math::beta_distribution<%1%>::find_beta";
RealType result = 0; // of error checks.
if(false ==
(
@@ -281,27 +282,27 @@ namespace boost
#endif
template <class RealType, class Policy>
inline const std::pair<RealType, RealType> range(const beta_distribution<RealType, Policy>& /* dist */)
BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> range(const beta_distribution<RealType, Policy>& /* dist */)
{ // Range of permissible values for random variable x.
using boost::math::tools::max_value;
return std::pair<RealType, RealType>(static_cast<RealType>(0), static_cast<RealType>(1));
return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), static_cast<RealType>(1));
}
template <class RealType, class Policy>
inline const std::pair<RealType, RealType> support(const beta_distribution<RealType, Policy>& /* dist */)
BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> support(const beta_distribution<RealType, Policy>& /* dist */)
{ // Range of supported values for random variable x.
// This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
return std::pair<RealType, RealType>(static_cast<RealType>(0), static_cast<RealType>(1));
return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), static_cast<RealType>(1));
}
template <class RealType, class Policy>
inline RealType mean(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType mean(const beta_distribution<RealType, Policy>& dist)
{ // Mean of beta distribution = np.
return dist.alpha() / (dist.alpha() + dist.beta());
} // mean
template <class RealType, class Policy>
inline RealType variance(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType variance(const beta_distribution<RealType, Policy>& dist)
{ // Variance of beta distribution = np(1-p).
RealType a = dist.alpha();
RealType b = dist.beta();
@@ -309,9 +310,9 @@ namespace boost
} // variance
template <class RealType, class Policy>
inline RealType mode(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType mode(const beta_distribution<RealType, Policy>& dist)
{
static const char* function = "boost::math::mode(beta_distribution<%1%> const&)";
constexpr auto function = "boost::math::mode(beta_distribution<%1%> const&)";
RealType result;
if ((dist.alpha() <= 1))
@@ -343,7 +344,7 @@ namespace boost
//But WILL be provided by the derived accessor as quantile(0.5).
template <class RealType, class Policy>
inline RealType skewness(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType skewness(const beta_distribution<RealType, Policy>& dist)
{
BOOST_MATH_STD_USING // ADL of std functions.
RealType a = dist.alpha();
@@ -352,7 +353,7 @@ namespace boost
} // skewness
template <class RealType, class Policy>
inline RealType kurtosis_excess(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const beta_distribution<RealType, Policy>& dist)
{
RealType a = dist.alpha();
RealType b = dist.beta();
@@ -363,17 +364,17 @@ namespace boost
} // kurtosis_excess
template <class RealType, class Policy>
inline RealType kurtosis(const beta_distribution<RealType, Policy>& dist)
BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const beta_distribution<RealType, Policy>& dist)
{
return 3 + kurtosis_excess(dist);
} // kurtosis
template <class RealType, class Policy>
inline RealType pdf(const beta_distribution<RealType, Policy>& dist, const RealType& x)
BOOST_MATH_GPU_ENABLED inline RealType pdf(const beta_distribution<RealType, Policy>& dist, const RealType& x)
{ // Probability Density/Mass Function.
BOOST_FPU_EXCEPTION_GUARD
static const char* function = "boost::math::pdf(beta_distribution<%1%> const&, %1%)";
constexpr auto function = "boost::math::pdf(beta_distribution<%1%> const&, %1%)";
BOOST_MATH_STD_USING // for ADL of std functions
@@ -428,11 +429,11 @@ namespace boost
} // pdf
template <class RealType, class Policy>
inline RealType cdf(const beta_distribution<RealType, Policy>& dist, const RealType& x)
BOOST_MATH_GPU_ENABLED inline RealType cdf(const beta_distribution<RealType, Policy>& dist, const RealType& x)
{ // Cumulative Distribution Function beta.
BOOST_MATH_STD_USING // for ADL of std functions
static const char* function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)";
constexpr auto function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)";
RealType a = dist.alpha();
RealType b = dist.beta();
@@ -459,12 +460,12 @@ namespace boost
} // beta cdf
template <class RealType, class Policy>
inline RealType cdf(const complemented2_type<beta_distribution<RealType, Policy>, RealType>& c)
BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<beta_distribution<RealType, Policy>, RealType>& c)
{ // Complemented Cumulative Distribution Function beta.
BOOST_MATH_STD_USING // for ADL of std functions
static const char* function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)";
constexpr auto function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)";
RealType const& x = c.param;
beta_distribution<RealType, Policy> const& dist = c.dist;
@@ -495,7 +496,7 @@ namespace boost
} // beta cdf
template <class RealType, class Policy>
inline RealType quantile(const beta_distribution<RealType, Policy>& dist, const RealType& p)
BOOST_MATH_GPU_ENABLED inline RealType quantile(const beta_distribution<RealType, Policy>& dist, const RealType& p)
{ // Quantile or Percent Point beta function or
// Inverse Cumulative probability distribution function CDF.
// Return x (0 <= x <= 1),
@@ -505,7 +506,7 @@ namespace boost
// will be less than or equal to that value
// is whatever probability you supplied as an argument.
static const char* function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)";
constexpr auto function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)";
RealType result = 0; // of argument checks:
RealType a = dist.alpha();
@@ -530,12 +531,12 @@ namespace boost
} // quantile
template <class RealType, class Policy>
inline RealType quantile(const complemented2_type<beta_distribution<RealType, Policy>, RealType>& c)
BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<beta_distribution<RealType, Policy>, RealType>& c)
{ // Complement Quantile or Percent Point beta function .
// Return the number of expected x for a given
// complement of the probability q.
static const char* function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)";
constexpr auto function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)";
//
// Error checks:
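
With the markers applied, the distribution's constructor, accessors, and the non-member pdf/cdf/quantile overloads become callable from device code, which is what the CUDA test kernel at the end of this commit exercises. A minimal sketch of a device-side call (assuming CUDA compilation):

__global__ void beta_cdf_kernel(const double* in, double* out, int n)
{
   int i = blockDim.x * blockIdx.x + threadIdx.x;
   if (i < n)
   {
      // Constructor, cdf, and the ADL lookup are all GPU-enabled now.
      out[i] = cdf(boost::math::beta_distribution<double>(2.0, 3.0), in[i]);
   }
}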

View File

@@ -12,6 +12,7 @@
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/tuple.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
@@ -877,20 +878,6 @@ BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, s
} //namespace policies
namespace detail{
//
// Simple helper function to assist in returning a pair from a single value,
// that value usually comes from one of the error handlers above:
//
template <class T>
BOOST_MATH_GPU_ENABLED std::pair<T, T> pair_from_single(const T& val) BOOST_MATH_NOEXCEPT(T)
{
return std::make_pair(val, val);
}
}
#ifdef _MSC_VER
# pragma warning(pop)
#endif
@@ -1039,7 +1026,21 @@ BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, b
} // namespace math
} // namespace boost
#endif
#endif // BOOST_MATH_HAS_NVRTC
namespace boost { namespace math { namespace detail {
//
// Simple helper function to assist in returning a pair from a single value,
// that value usually comes from one of the error handlers above:
//
template <class T>
BOOST_MATH_GPU_ENABLED boost::math::pair<T, T> pair_from_single(const T& val) BOOST_MATH_NOEXCEPT(T)
{
return boost::math::make_pair(val, val);
}
}}} // boost::math::detail
#endif // BOOST_MATH_POLICY_ERROR_HANDLING_HPP
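
pair_from_single is relocated out of the #ifndef BOOST_MATH_HAS_NVRTC region and retargeted at boost::math::pair so that device code can use it as well. A sketch of a typical call site, with a hypothetical error value for illustration:

// Sketch: collapse a single error-handler result into the
// (lower, upper) bracket pair the root-bracketing callers return.
T err = policies::raise_domain_error<T>(function, "Argument out of range: %1%", val, pol);
return boost::math::detail::pair_from_single(err); // yields {err, err}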

View File

@@ -28,14 +28,10 @@
#include <boost/math/policies/policy.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/constants/constants.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/binomial.hpp>
#include <boost/math/special_functions/factorials.hpp>
#include <boost/math/tools/roots.hpp>
#include <cmath>
#endif
namespace boost{ namespace math{
@@ -800,7 +796,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const boost::math::la
policies::check_series_iterations<T>("boost::math::ibeta<%1%>(%1%, %1%, %1%) in ibeta_series (without lanczos)", max_iter, pol);
return result;
}
#endif
//
// Continued fraction for the incomplete beta:
//
@@ -884,7 +880,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_a_step(T a, T b, T x, T y, int k, const Policy& p
return prefix;
}
#endif
//
// This function is only needed for the non-regular incomplete beta,
// it computes the delta in:
@@ -958,7 +954,6 @@ struct Pn_size<long double>
#endif
};
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
template <class T, class Policy>
BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Policy& pol, bool normalised)
{
@@ -1060,7 +1055,7 @@ BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T
}
return sum;
} // template <class T, class Lanczos>T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Lanczos& l, bool normalised)
#endif
//
// For integer arguments we can relate the incomplete beta to the
// complement of the binomial distribution cdf and use this finite sum.
@@ -1130,6 +1125,7 @@ BOOST_MATH_GPU_ENABLED T binomial_ccdf(T n, T k, T x, T y, const Policy& pol)
// input range and select the right implementation method for
// each domain:
//
template <class T, class Policy>
BOOST_MATH_GPU_ENABLED T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_derivative)
{
@@ -1749,12 +1745,7 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
} // namespace math
} // namespace boost
// TODO(mborland): Get the ibeta_inv working on NVRTC
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/special_functions/detail/ibeta_inverse.hpp>
#include <boost/math/special_functions/detail/ibeta_inv_ab.hpp>
#endif
#endif // BOOST_MATH_SPECIAL_BETA_HPP

View File

@@ -10,20 +10,21 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/factorials.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <type_traits>
namespace boost{ namespace math{
template <class T, class Policy>
T binomial_coefficient(unsigned n, unsigned k, const Policy& pol)
BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol)
{
static_assert(!std::is_integral<T>::value, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value, "Type T must not be an integral type");
BOOST_MATH_STD_USING
static const char* function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)";
constexpr auto function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)";
if(k > n)
return policies::raise_domain_error<T>(function, "The binomial coefficient is undefined for k > n, but got k = %1%.", static_cast<T>(k), pol);
T result; // LCOV_EXCL_LINE
@@ -43,9 +44,9 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol)
{
// Use the beta function:
if(k < n - k)
result = static_cast<T>(k * beta(static_cast<T>(k), static_cast<T>(n-k+1), pol));
result = static_cast<T>(k * boost::math::beta(static_cast<T>(k), static_cast<T>(n-k+1), pol));
else
result = static_cast<T>((n - k) * beta(static_cast<T>(k+1), static_cast<T>(n-k), pol));
result = static_cast<T>((n - k) * boost::math::beta(static_cast<T>(k+1), static_cast<T>(n-k), pol));
if(result == 0)
return policies::raise_overflow_error<T>(function, nullptr, pol);
result = 1 / result;
@@ -59,7 +60,7 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol)
// we'll promote to double:
//
template <>
inline float binomial_coefficient<float, policies::policy<> >(unsigned n, unsigned k, const policies::policy<>&)
BOOST_MATH_GPU_ENABLED inline float binomial_coefficient<float, policies::policy<> >(unsigned n, unsigned k, const policies::policy<>&)
{
typedef policies::normalise<
policies::policy<>,
@@ -71,7 +72,7 @@ inline float binomial_coefficient<float, policies::policy<> >(unsigned n, unsign
}
template <class T>
inline T binomial_coefficient(unsigned n, unsigned k)
BOOST_MATH_GPU_ENABLED inline T binomial_coefficient(unsigned n, unsigned k)
{
return binomial_coefficient<T>(n, k, policies::policy<>());
}
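
The call pattern itself is unchanged by the markers; for example, on the host:

#include <boost/math/special_functions/binomial.hpp>
#include <iostream>

int main()
{
   // C(10, 4) = 10! / (4! * 6!) = 210, via the factorial table.
   std::cout << boost::math::binomial_coefficient<double>(10, 4) << '\n';
}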

View File

@@ -11,15 +11,16 @@
#pragma once
#endif
#ifndef __CUDACC_RTC__
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <type_traits>
#include <cstdint>
namespace boost{ namespace math{
@@ -174,19 +175,30 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type cbrt(T z)
} // namespace math
} // namespace boost
#else
#else // Special NVRTC handling
namespace boost {
namespace math {
template <typename T>
__host__ __device__ T cbrt(T x)
BOOST_MATH_GPU_ENABLED double cbrt(T x)
{
return ::cbrt(x);
}
template <>
__host__ __device__ float cbrt(float x)
BOOST_MATH_GPU_ENABLED inline float cbrt(float x)
{
return ::cbrtf(x);
}
template <typename T, typename Policy>
BOOST_MATH_GPU_ENABLED double cbrt(T x, const Policy&)
{
return ::cbrt(x);
}
template <typename Policy>
BOOST_MATH_GPU_ENABLED float cbrt(float x, const Policy&)
{
return ::cbrtf(x);
}
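
The added policy-taking overloads let internal call sites of the form cbrt(x, pol) compile under NVRTC: the policy is accepted and ignored, and the call forwards to libdevice. A sketch of device-side usage (assuming NVRTC compilation and a policy type visible at the call site):

// Both forms now resolve under NVRTC.
double r1 = boost::math::cbrt(27.0);                                      // -> ::cbrt
float  r2 = boost::math::cbrt(27.0f, boost::math::policies::policy<>()); // -> ::cbrtf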

View File

@@ -17,17 +17,19 @@
#pragma once
#endif
#include <cstdint>
#include <utility>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/toms748_solve.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/policies/error_handling.hpp>
namespace boost{ namespace math{ namespace detail{
template <class T, class Policy>
struct beta_inv_ab_t
{
beta_inv_ab_t(T b_, T z_, T p_, bool invert_, bool swap_ab_) : b(b_), z(z_), p(p_), invert(invert_), swap_ab(swap_ab_) {}
T operator()(T a)
BOOST_MATH_GPU_ENABLED beta_inv_ab_t(T b_, T z_, T p_, bool invert_, bool swap_ab_) : b(b_), z(z_), p(p_), invert(invert_), swap_ab(swap_ab_) {}
BOOST_MATH_GPU_ENABLED T operator()(T a)
{
return invert ?
p - boost::math::ibetac(swap_ab ? b : a, swap_ab ? a : b, z, Policy())
@@ -39,7 +41,7 @@ private:
};
template <class T, class Policy>
T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Policy& pol)
BOOST_MATH_GPU_ENABLED T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Policy& pol)
{
BOOST_MATH_STD_USING
// mean:
@@ -72,7 +74,7 @@ T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Pol
}
template <class T, class Policy>
T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, const Policy& pol)
BOOST_MATH_GPU_ENABLED T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, const Policy& pol)
{
BOOST_MATH_STD_USING // for ADL of std lib math functions
//
@@ -121,11 +123,11 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab,
//
if((p < q) != swap_ab)
{
guess = (std::min)(T(b * 2), T(1));
guess = BOOST_MATH_GPU_SAFE_MIN(T(b * 2), T(1));
}
else
{
guess = (std::min)(T(b / 2), T(1));
guess = BOOST_MATH_GPU_SAFE_MIN(T(b / 2), T(1));
}
}
if(n * n * n * u * sf > 0.005)
@@ -138,11 +140,11 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab,
//
if((p < q) != swap_ab)
{
guess = (std::min)(T(b * 2), T(10));
guess = BOOST_MATH_GPU_SAFE_MIN(T(b * 2), T(10));
}
else
{
guess = (std::min)(T(b / 2), T(10));
guess = BOOST_MATH_GPU_SAFE_MIN(T(b / 2), T(10));
}
}
else
@@ -151,8 +153,8 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab,
//
// Max iterations permitted:
//
std::uintmax_t max_iter = policies::get_max_root_iterations<Policy>();
std::pair<T, T> r = bracket_and_solve_root(f, guess, factor, swap_ab ? true : false, tol, max_iter, pol);
boost::math::uintmax_t max_iter = policies::get_max_root_iterations<Policy>();
boost::math::pair<T, T> r = bracket_and_solve_root(f, guess, factor, swap_ab ? true : false, tol, max_iter, pol);
if(max_iter >= policies::get_max_root_iterations<Policy>())
return policies::raise_evaluation_error<T>("boost::math::ibeta_invab_imp<%1%>(%1%,%1%,%1%)", "Unable to locate the root within a reasonable number of iterations, closest approximation so far was %1%", r.first, pol);
return (r.first + r.second) / 2;
@@ -161,7 +163,7 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab,
} // namespace detail
template <class RT1, class RT2, class RT3, class Policy>
typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED typename tools::promote_args<RT1, RT2, RT3>::type
ibeta_inva(RT1 b, RT2 x, RT3 p, const Policy& pol)
{
typedef typename tools::promote_args<RT1, RT2, RT3>::type result_type;
@@ -194,7 +196,7 @@ typename tools::promote_args<RT1, RT2, RT3>::type
}
template <class RT1, class RT2, class RT3, class Policy>
typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_inva(RT1 b, RT2 x, RT3 q, const Policy& pol)
{
typedef typename tools::promote_args<RT1, RT2, RT3>::type result_type;
@@ -227,7 +229,7 @@ typename tools::promote_args<RT1, RT2, RT3>::type
}
template <class RT1, class RT2, class RT3, class Policy>
typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED typename tools::promote_args<RT1, RT2, RT3>::type
ibeta_invb(RT1 a, RT2 x, RT3 p, const Policy& pol)
{
typedef typename tools::promote_args<RT1, RT2, RT3>::type result_type;
@@ -260,7 +262,7 @@ typename tools::promote_args<RT1, RT2, RT3>::type
}
template <class RT1, class RT2, class RT3, class Policy>
typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_invb(RT1 a, RT2 x, RT3 q, const Policy& pol)
{
constexpr auto function = "boost::math::ibeta_invb<%1%>(%1%, %1%, %1%)";
@@ -293,28 +295,28 @@ typename tools::promote_args<RT1, RT2, RT3>::type
}
template <class RT1, class RT2, class RT3>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibeta_inva(RT1 b, RT2 x, RT3 p)
{
return boost::math::ibeta_inva(b, x, p, policies::policy<>());
}
template <class RT1, class RT2, class RT3>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_inva(RT1 b, RT2 x, RT3 q)
{
return boost::math::ibetac_inva(b, x, q, policies::policy<>());
}
template <class RT1, class RT2, class RT3>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibeta_invb(RT1 a, RT2 x, RT3 p)
{
return boost::math::ibeta_invb(a, x, p, policies::policy<>());
}
template <class RT1, class RT2, class RT3>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_invb(RT1 a, RT2 x, RT3 q)
{
return boost::math::ibetac_invb(a, x, q, policies::policy<>());
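
The BOOST_MATH_GPU_SAFE_MIN macro and the boost::math::uintmax_t/pair aliases used above replace (std::min), std::uintmax_t, and std::pair, which are not usable from NVRTC device code. A sketch of the min shim's likely shape (an assumption; compare the gpu_safe_max helper visible in the config.hpp hunk further down):

// Sketch only: a device-callable min.
template <class T>
BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_min(const T& a, const T& b)
{
   return a < b ? a : b;
}
#define BOOST_MATH_GPU_SAFE_MIN(a, b) gpu_safe_min(a, b)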

View File

@@ -11,12 +11,14 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/roots.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/erf.hpp>
#include <boost/math/tools/roots.hpp>
#include <boost/math/special_functions/detail/t_distribution_inv.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/tools/precision.hpp>
namespace boost{ namespace math{ namespace detail{
@@ -27,12 +29,12 @@ namespace boost{ namespace math{ namespace detail{
template <class T>
struct temme_root_finder
{
temme_root_finder(const T t_, const T a_) : t(t_), a(a_) {
BOOST_MATH_GPU_ENABLED temme_root_finder(const T t_, const T a_) : t(t_), a(a_) {
BOOST_MATH_ASSERT(
math::tools::epsilon<T>() <= a && !(boost::math::isinf)(a));
}
boost::math::tuple<T, T> operator()(T x)
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(T x)
{
BOOST_MATH_STD_USING // ADL of std names
@@ -52,7 +54,7 @@ private:
// Section 2.
//
template <class T, class Policy>
T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol)
BOOST_MATH_GPU_ENABLED T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
@@ -138,7 +140,7 @@ T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol)
// Section 3.
//
template <class T, class Policy>
T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy& pol)
BOOST_MATH_GPU_ENABLED T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
@@ -315,7 +317,7 @@ T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy
// Section 4.
//
template <class T, class Policy>
T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol)
BOOST_MATH_GPU_ENABLED T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
@@ -420,10 +422,10 @@ T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol)
template <class T, class Policy>
struct ibeta_roots
{
ibeta_roots(T _a, T _b, T t, bool inv = false)
BOOST_MATH_GPU_ENABLED ibeta_roots(T _a, T _b, T t, bool inv = false)
: a(_a), b(_b), target(t), invert(inv) {}
boost::math::tuple<T, T, T> operator()(T x)
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T, T> operator()(T x)
{
BOOST_MATH_STD_USING // ADL of std names
@@ -457,7 +459,7 @@ private:
};
template <class T, class Policy>
T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
BOOST_MATH_GPU_ENABLED T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
{
BOOST_MATH_STD_USING // For ADL of math functions.
@@ -487,8 +489,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
return p;
}
// Change things around so we can handle as b == 1 special case below:
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = true;
}
//
@@ -524,8 +526,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
}
else if(b > 0.5f)
{
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = !invert;
}
}
@@ -559,7 +561,7 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
y = -boost::math::expm1(boost::math::log1p(-q, pol) / a, pol);
}
if(invert)
std::swap(x, y);
BOOST_MATH_GPU_SAFE_SWAP(x, y);
if(py)
*py = y;
return x;
@@ -574,12 +576,12 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
//
if(p > 0.5)
{
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = !invert;
}
T minv = (std::min)(a, b);
T maxv = (std::max)(a, b);
T minv = BOOST_MATH_GPU_SAFE_MIN(a, b);
T maxv = BOOST_MATH_GPU_SAFE_MAX(a, b);
if((sqrt(minv) > (maxv - minv)) && (minv > 5))
{
//
@@ -630,8 +632,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
//
if(a < b)
{
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = !invert;
}
//
@@ -694,8 +696,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
}
if(fs < 0)
{
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = !invert;
xs = 1 - xs;
}
@@ -758,9 +760,9 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
if(ps < 0)
{
std::swap(a, b);
std::swap(p, q);
std::swap(xs, xs2);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
BOOST_MATH_GPU_SAFE_SWAP(xs, xs2);
invert = !invert;
}
//
@@ -823,8 +825,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
//
if(b < a)
{
std::swap(a, b);
std::swap(p, q);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
invert = !invert;
}
if (a < tools::min_value<T>())
@@ -890,9 +892,9 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
//
if(x > 0.5)
{
std::swap(a, b);
std::swap(p, q);
std::swap(x, y);
BOOST_MATH_GPU_SAFE_SWAP(a, b);
BOOST_MATH_GPU_SAFE_SWAP(p, q);
BOOST_MATH_GPU_SAFE_SWAP(x, y);
invert = !invert;
T l = 1 - upper;
T u = 1 - lower;
@@ -922,8 +924,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
if(x < lower)
x = lower;
}
std::uintmax_t max_iter = policies::get_max_root_iterations<Policy>();
std::uintmax_t max_iter_used = 0;
boost::math::uintmax_t max_iter = policies::get_max_root_iterations<Policy>();
boost::math::uintmax_t max_iter_used = 0;
//
// Figure out how many digits to iterate towards:
//
@@ -946,7 +948,13 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
// Now iterate, we can use either p or q as the target here
// depending on which is smaller:
//
// Since we can't use halley_iterate on device we use newton raphson
//
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
x = boost::math::tools::halley_iterate(
#else
x = boost::math::tools::newton_raphson_iterate(
#endif
boost::math::detail::ibeta_roots<T, Policy>(a, b, (p < q ? p : q), (p < q ? false : true)), x, lower, upper, digits, max_iter);
policies::check_root_iterations<T>("boost::math::ibeta<%1%>(%1%, %1%, %1%)", max_iter + max_iter_used, pol);
//
@@ -968,7 +976,7 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py)
} // namespace detail
template <class T1, class T2, class T3, class T4, class Policy>
inline typename tools::promote_args<T1, T2, T3, T4>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3, T4>::type
ibeta_inv(T1 a, T2 b, T3 p, T4* py, const Policy& pol)
{
constexpr auto function = "boost::math::ibeta_inv<%1%>(%1%,%1%,%1%)";
@@ -1003,14 +1011,14 @@ inline typename tools::promote_args<T1, T2, T3, T4>::type
}
template <class T1, class T2, class T3, class T4>
inline typename tools::promote_args<T1, T2, T3, T4>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3, T4>::type
ibeta_inv(T1 a, T2 b, T3 p, T4* py)
{
return ibeta_inv(a, b, p, py, policies::policy<>());
}
template <class T1, class T2, class T3>
inline typename tools::promote_args<T1, T2, T3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
ibeta_inv(T1 a, T2 b, T3 p)
{
typedef typename tools::promote_args<T1, T2, T3>::type result_type;
@@ -1018,7 +1026,7 @@ inline typename tools::promote_args<T1, T2, T3>::type
}
template <class T1, class T2, class T3, class Policy>
inline typename tools::promote_args<T1, T2, T3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
ibeta_inv(T1 a, T2 b, T3 p, const Policy& pol)
{
typedef typename tools::promote_args<T1, T2, T3>::type result_type;
@@ -1026,7 +1034,7 @@ inline typename tools::promote_args<T1, T2, T3>::type
}
template <class T1, class T2, class T3, class T4, class Policy>
inline typename tools::promote_args<T1, T2, T3, T4>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3, T4>::type
ibetac_inv(T1 a, T2 b, T3 q, T4* py, const Policy& pol)
{
constexpr auto function = "boost::math::ibetac_inv<%1%>(%1%,%1%,%1%)";
@@ -1061,14 +1069,14 @@ inline typename tools::promote_args<T1, T2, T3, T4>::type
}
template <class T1, class T2, class T3, class T4>
inline typename tools::promote_args<T1, T2, T3, T4>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3, T4>::type
ibetac_inv(T1 a, T2 b, T3 q, T4* py)
{
return ibetac_inv(a, b, q, py, policies::policy<>());
}
template <class RT1, class RT2, class RT3>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_inv(RT1 a, RT2 b, RT3 q)
{
typedef typename tools::promote_args<RT1, RT2, RT3>::type result_type;
@@ -1076,7 +1084,7 @@ inline typename tools::promote_args<RT1, RT2, RT3>::type
}
template <class RT1, class RT2, class RT3, class Policy>
inline typename tools::promote_args<RT1, RT2, RT3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, RT3>::type
ibetac_inv(RT1 a, RT2 b, RT3 q, const Policy& pol)
{
typedef typename tools::promote_args<RT1, RT2, RT3>::type result_type;
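
As the #ifdef in ibeta_inv_imp notes, device builds swap halley_iterate for newton_raphson_iterate. The Newton-Raphson driver wants a functor returning the value and first derivative; a self-contained illustrative sketch using the canonical cube-root functor (not the ibeta_roots functor above):

#include <boost/math/tools/roots.hpp>
#include <cstdint>
#include <utility>

struct cbrt_functor
{
   explicit cbrt_functor(double target) : a(target) {}
   // Returns f(x) = x^3 - a and f'(x) = 3x^2, as Newton-Raphson requires.
   std::pair<double, double> operator()(double x) const
   { return { x * x * x - a, 3 * x * x }; }
   double a;
};

double cbrt_by_newton(double a) // bracket below is valid for a >= 1
{
   std::uintmax_t max_iter = 50;
   return boost::math::tools::newton_raphson_iterate(
      cbrt_functor(a), a / 2, 0.0, a, 40, max_iter); // e.g. 27 -> 3
}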

View File

@@ -11,6 +11,9 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/special_functions/cbrt.hpp>
#include <boost/math/special_functions/round.hpp>
#include <boost/math/special_functions/trunc.hpp>
@@ -24,7 +27,7 @@ namespace boost{ namespace math{ namespace detail{
// Communications of the ACM, 13(10): 619-620, Oct., 1970.
//
template <class T, class Policy>
T inverse_students_t_hill(T ndf, T u, const Policy& pol)
BOOST_MATH_GPU_ENABLED T inverse_students_t_hill(T ndf, T u, const Policy& pol)
{
BOOST_MATH_STD_USING
BOOST_MATH_ASSERT(u <= 0.5);
@@ -74,7 +77,7 @@ T inverse_students_t_hill(T ndf, T u, const Policy& pol)
// Journal of Computational Finance, Vol 9 Issue 4, pp 37-73, Summer 2006
//
template <class T, class Policy>
T inverse_students_t_tail_series(T df, T v, const Policy& pol)
BOOST_MATH_GPU_ENABLED T inverse_students_t_tail_series(T df, T v, const Policy& pol)
{
BOOST_MATH_STD_USING
// Tail series expansion, see section 6 of Shaw's paper.
@@ -125,7 +128,7 @@ T inverse_students_t_tail_series(T df, T v, const Policy& pol)
}
template <class T, class Policy>
T inverse_students_t_body_series(T df, T u, const Policy& pol)
BOOST_MATH_GPU_ENABLED T inverse_students_t_body_series(T df, T u, const Policy& pol)
{
BOOST_MATH_STD_USING
//
@@ -204,7 +207,7 @@ T inverse_students_t_body_series(T df, T u, const Policy& pol)
}
template <class T, class Policy>
T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr)
BOOST_MATH_GPU_ENABLED T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr)
{
//
// df = number of degrees of freedom.
@@ -220,7 +223,7 @@ T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr)
if(u > v)
{
// function is symmetric, invert it:
std::swap(u, v);
BOOST_MATH_GPU_SAFE_SWAP(u, v);
invert = true;
}
if((floor(df) == df) && (df < 20))
@@ -416,7 +419,7 @@ calculate_real:
}
template <class T, class Policy>
inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol)
{
T u = p / 2;
T v = 1 - u;
@@ -427,7 +430,7 @@ inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol)
}
template <class T, class Policy>
inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::false_type*)
BOOST_MATH_GPU_ENABLED inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const boost::math::false_type*)
{
BOOST_MATH_STD_USING
//
@@ -450,12 +453,12 @@ inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::f
}
template <class T, class Policy>
T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::true_type*)
BOOST_MATH_GPU_ENABLED T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const boost::math::true_type*)
{
BOOST_MATH_STD_USING
bool invert = false;
if((df < 2) && (floor(df) != df))
return boost::math::detail::fast_students_t_quantile_imp(df, p, pol, static_cast<std::false_type*>(nullptr));
return boost::math::detail::fast_students_t_quantile_imp(df, p, pol, static_cast<boost::math::false_type*>(nullptr));
if(p > 0.5)
{
p = 1 - p;
@@ -521,7 +524,7 @@ T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::true_typ
}
template <class T, class Policy>
inline T fast_students_t_quantile(T df, T p, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T fast_students_t_quantile(T df, T p, const Policy& pol)
{
typedef typename policies::evaluation<T, Policy>::type value_type;
typedef typename policies::normalise<
@@ -531,12 +534,12 @@ inline T fast_students_t_quantile(T df, T p, const Policy& pol)
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
typedef std::integral_constant<bool,
(std::numeric_limits<T>::digits <= 53)
typedef boost::math::integral_constant<bool,
(boost::math::numeric_limits<T>::digits <= 53)
&&
(std::numeric_limits<T>::is_specialized)
(boost::math::numeric_limits<T>::is_specialized)
&&
(std::numeric_limits<T>::radix == 2)
(boost::math::numeric_limits<T>::radix == 2)
> tag_type;
return policies::checked_narrowing_cast<T, forwarding_policy>(fast_students_t_quantile_imp(static_cast<value_type>(df), static_cast<value_type>(p), pol, static_cast<tag_type*>(nullptr)), "boost::math::students_t_quantile<%1%>(%1%,%1%,%1%)");
}
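
The std-to-boost swaps in the tag_type above matter because this compile-time dispatch must keep working where <type_traits> and <limits> are unavailable (NVRTC). Restating the pattern from the function above in miniature:

// The tag's value picks an overload at compile time, so only the
// matching implementation is instantiated and called:
typedef boost::math::integral_constant<bool,
   (boost::math::numeric_limits<T>::digits <= 53) &&
   (boost::math::numeric_limits<T>::is_specialized) &&
   (boost::math::numeric_limits<T>::radix == 2)> tag_type;

fast_students_t_quantile_imp(df, p, pol, static_cast<tag_type*>(nullptr));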

View File

@@ -1736,43 +1736,10 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool in
// Ratios of two gamma functions:
//
template <class T, class Policy, class Lanczos>
BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l)
BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos_final(T z, T delta, const Policy& pol, const Lanczos&)
{
BOOST_MATH_STD_USING
if(z < tools::epsilon<T>())
{
//
// We get spurious numeric overflow unless we're very careful, this
// can occur either inside Lanczos::lanczos_sum(z) or in the
// final combination of terms, to avoid this, split the product up
// into 2 (or 3) parts:
//
// G(z) / G(L) = 1 / (z * G(L)) ; z < eps, L = z + delta = delta
// z * G(L) = z * G(lim) * (G(L)/G(lim)) ; lim = largest factorial
//
if(boost::math::max_factorial<T>::value < delta)
{
T ratio = tgamma_delta_ratio_imp_lanczos(delta, T(boost::math::max_factorial<T>::value - delta), pol, l);
ratio *= z;
ratio *= boost::math::unchecked_factorial<T>(boost::math::max_factorial<T>::value - 1);
return 1 / ratio;
}
else
{
#ifdef BOOST_MATH_HAS_NVRTC
if (boost::math::is_same_v<T, float>)
{
return 1 / (z * ::tgammaf(z + delta));
}
else
{
return 1 / (z * ::tgamma(z + delta));
}
#else
return 1 / (z * boost::math::tgamma(z + delta, pol));
#endif
}
}
T zgh = static_cast<T>(z + T(Lanczos::g()) - constants::half<T>());
T result;
if(z + delta == z)
@@ -1806,6 +1773,50 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Poli
result *= pow(T(constants::e<T>() / (zgh + delta)), delta);
return result;
}
template <class T, class Policy, class Lanczos>
BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l)
{
BOOST_MATH_STD_USING
if(z < tools::epsilon<T>())
{
//
// We get spurious numeric overflow unless we're very careful, this
// can occur either inside Lanczos::lanczos_sum(z) or in the
// final combination of terms, to avoid this, split the product up
// into 2 (or 3) parts:
//
// G(z) / G(L) = 1 / (z * G(L)) ; z < eps, L = z + delta = delta
// z * G(L) = z * G(lim) * (G(L)/G(lim)) ; lim = largest factorial
//
if(boost::math::max_factorial<T>::value < delta)
{
T ratio = tgamma_delta_ratio_imp_lanczos_final(T(delta), T(boost::math::max_factorial<T>::value - delta), pol, l);
ratio *= z;
ratio *= boost::math::unchecked_factorial<T>(boost::math::max_factorial<T>::value - 1);
return 1 / ratio;
}
else
{
#ifdef BOOST_MATH_HAS_NVRTC
if (boost::math::is_same_v<T, float>)
{
return 1 / (z * ::tgammaf(z + delta));
}
else
{
return 1 / (z * ::tgamma(z + delta));
}
#else
return 1 / (z * boost::math::tgamma(z + delta, pol));
#endif
}
}
return tgamma_delta_ratio_imp_lanczos_final(T(z), T(delta), pol, l);
}
//
// And again without Lanczos support this time:
//

View File

@@ -26,7 +26,19 @@
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
#ifdef BOOST_MATH_HAS_NVRTC
namespace boost {
namespace math {
template <class RT1, class RT2, class A>
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<RT1, RT2, A>::type
beta(RT1 a, RT2 b, A arg);
} // namespace math
} // namespace boost
#else
#include <vector>
#include <complex>
@@ -154,9 +166,9 @@ namespace boost
// Binomial:
template <class T, class Policy>
T binomial_coefficient(unsigned n, unsigned k, const Policy& pol);
BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol);
template <class T>
T binomial_coefficient(unsigned n, unsigned k);
BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k);
// erf & erfc error functions.
template <class RT> // Error function.
@@ -874,19 +886,19 @@ namespace boost
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> cos_pi(T x);
template <class T>
int fpclassify BOOST_NO_MACRO_EXPAND(T t);
BOOST_MATH_GPU_ENABLED int fpclassify BOOST_NO_MACRO_EXPAND(T t);
template <class T>
bool isfinite BOOST_NO_MACRO_EXPAND(T z);
BOOST_MATH_GPU_ENABLED bool isfinite BOOST_NO_MACRO_EXPAND(T z);
template <class T>
bool isinf BOOST_NO_MACRO_EXPAND(T t);
BOOST_MATH_GPU_ENABLED bool isinf BOOST_NO_MACRO_EXPAND(T t);
template <class T>
bool isnan BOOST_NO_MACRO_EXPAND(T t);
BOOST_MATH_GPU_ENABLED bool isnan BOOST_NO_MACRO_EXPAND(T t);
template <class T>
bool isnormal BOOST_NO_MACRO_EXPAND(T t);
BOOST_MATH_GPU_ENABLED bool isnormal BOOST_NO_MACRO_EXPAND(T t);
template<class T>
BOOST_MATH_GPU_ENABLED int signbit BOOST_NO_MACRO_EXPAND(T x);
@@ -1218,62 +1230,62 @@ namespace boost
BOOST_MATH_DETAIL_11_FUNC(Policy)\
\
template <class RT1, class RT2>\
inline boost::math::tools::promote_args_t<RT1, RT2> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2> \
beta(RT1 a, RT2 b) { return ::boost::math::beta(a, b, Policy()); }\
\
template <class RT1, class RT2, class A>\
inline boost::math::tools::promote_args_t<RT1, RT2, A> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, A> \
beta(RT1 a, RT2 b, A x){ return ::boost::math::beta(a, b, x, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
betac(RT1 a, RT2 b, RT3 x) { return ::boost::math::betac(a, b, x, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibeta(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibeta(a, b, x, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibetac(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibetac(a, b, x, Policy()); }\
\
template <class T1, class T2, class T3, class T4>\
inline boost::math::tools::promote_args_t<T1, T2, T3, T4> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2, T3, T4> \
ibeta_inv(T1 a, T2 b, T3 p, T4* py){ return ::boost::math::ibeta_inv(a, b, p, py, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibeta_inv(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_inv(a, b, p, Policy()); }\
\
template <class T1, class T2, class T3, class T4>\
inline boost::math::tools::promote_args_t<T1, T2, T3, T4> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2, T3, T4> \
ibetac_inv(T1 a, T2 b, T3 q, T4* py){ return ::boost::math::ibetac_inv(a, b, q, py, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibeta_inva(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_inva(a, b, p, Policy()); }\
\
template <class T1, class T2, class T3>\
inline boost::math::tools::promote_args_t<T1, T2, T3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2, T3> \
ibetac_inva(T1 a, T2 b, T3 q){ return ::boost::math::ibetac_inva(a, b, q, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibeta_invb(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_invb(a, b, p, Policy()); }\
\
template <class T1, class T2, class T3>\
inline boost::math::tools::promote_args_t<T1, T2, T3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T1, T2, T3> \
ibetac_invb(T1 a, T2 b, T3 q){ return ::boost::math::ibetac_invb(a, b, q, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibetac_inv(RT1 a, RT2 b, RT3 q){ return ::boost::math::ibetac_inv(a, b, q, Policy()); }\
\
template <class RT1, class RT2, class RT3>\
inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT1, RT2, RT3> \
ibeta_derivative(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibeta_derivative(a, b, x, Policy()); }\
\
template <class T> T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient<T, Policy>(n, k, Policy()); }\
template <class T> BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient<T, Policy>(n, k, Policy()); }\
\
template <class RT>\
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<RT> erf(RT z) { return ::boost::math::erf(z, Policy()); }\

View File

@@ -800,7 +800,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return
#define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast<T>(V)
#define BOOST_MATH_FORCEINLINE __forceinline__
#define BOOST_MATH_STD_USING
#define BOOST_MATH_IF_CONSTEXPR if constexpr
#define BOOST_MATH_IF_CONSTEXPR if
#define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point<T>::value)
#define BOOST_MATH_CONSTEXPR_TABLE_FUNCTION constexpr
#define BOOST_MATH_NO_EXCEPTIONS
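
With BOOST_MATH_IF_CONSTEXPR defined as a plain if under NVRTC, the branch is taken at run time rather than discarded at compile time, so both arms must compile for every instantiation. Call sites are unchanged; a sketch of the usage pattern seen elsewhere in this commit:

// 'if constexpr' on conforming compilers, a plain 'if' under NVRTC --
// so both branches must be well-formed for every T used here.
BOOST_MATH_IF_CONSTEXPR (boost::math::is_same_v<T, float>)
{
   return ::cbrtf(x);
}
else
{
   return ::cbrt(x);
}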

View File

@@ -25,6 +25,13 @@ run test_bernoulli_pdf_float.cu ;
run test_bernoulli_range_support_double.cu ;
run test_bernoulli_range_support_float.cu ;
run test_beta_dist_cdf_double.cu ;
run test_beta_dist_cdf_float.cu ;
run test_beta_dist_pdf_double.cu ;
run test_beta_dist_pdf_float.cu ;
run test_beta_dist_quan_double.cu ;
run test_beta_dist_quan_float.cu ;
run test_cauchy_cdf_double.cu ;
run test_cauchy_cdf_float.cu ;
run test_cauchy_pdf_double.cu ;
@@ -107,6 +114,24 @@ run test_weibull_quan_float.cu ;
# Special Functions
run test_beta_double.cu ;
run test_beta_float.cu ;
run test_betac_double.cu ;
run test_betac_float.cu ;
run test_ibeta_double.cu ;
run test_ibeta_float.cu ;
run test_ibeta_derivative_double.cu ;
run test_ibeta_derivative_float.cu ;
run test_ibeta_inv_double.cu ;
run test_ibeta_inv_float.cu ;
run test_ibeta_inva_double.cu ;
run test_ibeta_inva_float.cu ;
run test_ibeta_invb_double.cu ;
run test_ibeta_invb_float.cu ;
run test_ibetac_inv_double.cu ;
run test_ibetac_inv_float.cu ;
run test_ibetac_inva_double.cu ;
run test_ibetac_inva_float.cu ;
run test_ibetac_invb_double.cu ;
run test_ibetac_invb_float.cu ;
run test_bessel_i0_double.cu ;
run test_bessel_i0_float.cu ;

View File

@@ -24,6 +24,13 @@ run test_bernoulli_pdf_nvrtc_float.cpp ;
run test_bernoulli_quan_nvrtc_double.cpp ;
run test_bernoulli_quan_nvrtc_float.cpp ;
run test_beta_dist_cdf_nvrtc_double.cpp ;
run test_beta_dist_cdf_nvrtc_float.cpp ;
run test_beta_dist_pdf_nvrtc_double.cpp ;
run test_beta_dist_pdf_nvrtc_float.cpp ;
run test_beta_dist_quan_nvrtc_double.cpp ;
run test_beta_dist_quan_nvrtc_float.cpp ;
run test_cauchy_cdf_nvrtc_double.cpp ;
run test_cauchy_cdf_nvrtc_float.cpp ;
run test_cauchy_pdf_nvrtc_double.cpp ;
@@ -104,6 +111,26 @@ run test_weibull_quan_nvrtc_float.cpp ;
# Special Functions
run test_beta_nvrtc_double.cpp ;
run test_beta_nvrtc_float.cpp ;
run test_betac_nvrtc_double.cpp ;
run test_betac_nvrtc_float.cpp ;
run test_ibeta_nvrtc_double.cpp ;
run test_ibeta_nvrtc_float.cpp ;
run test_ibetac_nvrtc_double.cpp ;
run test_ibetac_nvrtc_float.cpp ;
run test_ibeta_derivative_nvrtc_double.cpp ;
run test_ibeta_derivative_nvrtc_float.cpp ;
run test_ibeta_inv_nvrtc_double.cpp ;
run test_ibeta_inv_nvrtc_float.cpp ;
run test_ibeta_inva_nvrtc_double.cpp ;
run test_ibeta_inva_nvrtc_float.cpp ;
run test_ibeta_invb_nvrtc_double.cpp ;
run test_ibeta_invb_nvrtc_float.cpp ;
run test_ibetac_inv_nvrtc_double.cpp ;
run test_ibetac_inv_nvrtc_float.cpp ;
run test_ibetac_inva_nvrtc_double.cpp ;
run test_ibetac_inva_nvrtc_float.cpp ;
run test_ibetac_invb_nvrtc_double.cpp ;
run test_ibetac_invb_nvrtc_float.cpp ;
run test_bessel_i0_nvrtc_double.cpp ;
run test_bessel_i0_nvrtc_float.cpp ;

View File

@@ -12,6 +12,7 @@ project : requirements
# Distributions
run test_arcsine.cpp ;
run test_bernoulli.cpp ;
run test_beta_dist.cpp ;
run test_cauchy.cpp ;
run test_chi_squared.cpp ;
run test_exponential_dist.cpp ;
@@ -28,6 +29,10 @@ run test_weibull.cpp ;
run pow_test.cpp ;
run test_beta_simple.cpp ;
run test_beta.cpp ;
run test_ibeta.cpp ;
run test_ibeta_inv.cpp ;
run test_ibeta_inv_ab.cpp ;
run test_bessel_i.cpp ;
run test_bessel_j.cpp ;

View File

@@ -15,7 +15,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_bessel_i.hpp"

View File

@@ -15,7 +15,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_bessel_j.hpp"

View File

@@ -22,7 +22,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_bessel_k.hpp"

View File

@@ -15,7 +15,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_bessel_y.hpp"

View File

@@ -5,7 +5,17 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include "pch_light.hpp"
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#endif
#ifdef __clang__
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_beta.hpp"

View File

@@ -18,9 +18,10 @@
#define BOOST_TEST_MAIN
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/stats.hpp>
#include <boost/math/tools/test.hpp>
#include "../include_private/boost/math/tools/test.hpp"
#include <boost/math/constants/constants.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>
@@ -109,9 +110,12 @@ void test_spots(T)
// Inexact input, so disable for ultra precise long doubles:
BOOST_CHECK_CLOSE(::boost::math::beta(static_cast<T>(0.0125L), static_cast<T>(0.000023L)), static_cast<T>(43558.24045647538375006349016083320744662L), tolerance * 2);
}
#ifndef BOOST_MATH_NO_EXCEPTIONS
BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(0), static_cast<T>(1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(-1), static_cast<T>(1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(1), static_cast<T>(-1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::beta(static_cast<T>(1), static_cast<T>(0)), std::domain_error);
#endif
}


@@ -32,9 +32,14 @@
# pragma warning (disable : 4224) // nonstandard extension used : formal parameter 'arg' was previously defined as a type.
#endif
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
#include <boost/math/concepts/real_concept.hpp> // for real_concept
using ::boost::math::concepts::real_concept;
#include <boost/math/tools/test.hpp>
#endif
#include "../include_private/boost/math/tools/test.hpp"
#include <boost/math/distributions/beta.hpp> // for beta_distribution
using boost::math::beta_distribution;
@@ -634,12 +639,13 @@ BOOST_AUTO_TEST_CASE( test_main )
BOOST_CHECK_CLOSE_FRACTION(mybeta22.find_alpha(mybeta22.beta(), 0.8, cdf(mybeta22, 0.8)), mybeta22.alpha(), tol);
BOOST_CHECK_CLOSE_FRACTION(mybeta22.find_beta(mybeta22.alpha(), 0.8, cdf(mybeta22, 0.8)), mybeta22.beta(), tol);
#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
beta_distribution<real_concept> rcbeta22(2, 2); // Using RealType real_concept.
cout << "numeric_limits<real_concept>::is_specialized " << numeric_limits<real_concept>::is_specialized << endl;
cout << "numeric_limits<real_concept>::digits " << numeric_limits<real_concept>::digits << endl;
cout << "numeric_limits<real_concept>::digits10 " << numeric_limits<real_concept>::digits10 << endl;
cout << "numeric_limits<real_concept>::epsilon " << numeric_limits<real_concept>::epsilon() << endl;
#endif
// (Parameter value, arbitrarily zero, only communicates the floating point type).
test_spots(0.0F); // Test float.


@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * CDF for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
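// Integer ceiling division: launch enough blocks to cover every element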
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
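// Block until the kernel completes so the elapsed time and error check below are meaningful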
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(cdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * CDF for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(cdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
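// The unnamed second parameter is unused; the host still supplies a buffer (d_in2)
// for it, so all of these test kernels can share one launch signature.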
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
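// The include paths below are machine-specific; point --include-path at your own checkout.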
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = cdf(boost::math::beta_distribution<float_type>(), h_in1[i]);
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
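// cuLaunchKernel receives the kernel arguments as an array of pointers, in declaration order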
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = cdf(boost::math::beta_distribution<float_type>(), h_in1[i]);
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * PDF for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = pdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
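// The default range [0, 1) matches the support of the beta distribution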
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(pdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * PDF for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = pdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(pdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
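// Compare device and host values in units of machine epsilon; 100 eps of slack
// absorbs minor rounding differences between GPU and CPU math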
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = pdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
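// Fixed seed keeps the host/device comparison reproducible across runs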
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = pdf(boost::math::beta_distribution<float_type>(), h_in1[i]);
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = pdf(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = pdf(boost::math::beta_distribution<float_type>(), h_in1[i]);
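// With the ignore_error overflow policy above, overflow yields an infinity
// instead of throwing, so non-finite reference values are skipped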
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * quantile for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
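// quantile() inverts the CDF, so this exercises the ibeta_inv code path on the device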
out[i] = quantile(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(quantile(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,109 @@
// Copyright John Maddock 2016.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
 * CUDA kernel device code: evaluates the beta distribution
 * quantile for each input element.
 */
__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = quantile(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
try{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
boost::random::mt19937 gen;
boost::random::uniform_real_distribution<float_type> dist;
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = dist(gen);
}
// Launch the CUDA test kernel
int threadsPerBlock = 512;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
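// Repeat the computation serially on the host, both as the reference result and for a rough timing comparison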
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(quantile(boost::math::beta_distribution<float_type>(), input_vector1[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = quantile(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = quantile(boost::math::beta_distribution<float_type>(), h_in1[i]);
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,191 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/distributions/beta.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/distributions/beta.hpp>
extern "C" __global__
void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = quantile(boost::math::beta_distribution<float_type>(), in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_dist_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
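// Quantile arguments are probabilities; the half-open range [0, 1) keeps them valid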
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
auto res = quantile(boost::math::beta_distribution<float_type>(), h_in1[i]);
if (boost::math::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

test/test_betac_double.cu (new file, 146 lines)

@@ -0,0 +1,146 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
 * CUDA kernel device code: evaluates betac (the complement of the
 * normalised incomplete beta function) for each input triple.
 */
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
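// table_type and SC_ are required by the tabulated test data in the .ipp files included below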
#include "beta_med_data.ipp"
#include "beta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < beta_med_data.size(); ++i)
{
v1.push_back(beta_med_data[i][0]);
v2.push_back(beta_med_data[i][1]);
v3.push_back(beta_med_data[i][2]);
}
for(unsigned i = 0; i < beta_small_data.size(); ++i)
{
v1.push_back(beta_small_data[i][0]);
v2.push_back(beta_small_data[i][1]);
v3.push_back(beta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::betac(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
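
cuda_managed_ptr above is a test-local RAII wrapper around CUDA unified memory; its real definition lives in test/cuda_managed_ptr.hpp. A hypothetical minimal version, assuming only the get() and operator[] interface these tests use:

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical sketch of cuda_managed_ptr -- the real test header may differ.
template <class T>
class cuda_managed_ptr
{
    T* ptr_ = nullptr;

public:
    explicit cuda_managed_ptr(std::size_t n)
    {
        // Unified memory: the same pointer is valid on host and device.
        cudaMallocManaged(&ptr_, n * sizeof(T));
    }
    ~cuda_managed_ptr() { cudaFree(ptr_); }
    cuda_managed_ptr(const cuda_managed_ptr&) = delete;
    cuda_managed_ptr& operator=(const cuda_managed_ptr&) = delete;

    T* get() const { return ptr_; }
    T& operator[](std::size_t i) const { return ptr_[i]; }
};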

test/test_betac_float.cu (new file)

@@ -0,0 +1,146 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "beta_med_data.ipp"
#include "beta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < beta_med_data.size(); ++i)
{
v1.push_back(beta_med_data[i][0]);
v2.push_back(beta_med_data[i][1]);
v3.push_back(beta_med_data[i][2]);
}
for(unsigned i = 0; i < beta_small_data.size(); ++i)
{
v1.push_back(beta_small_data[i][0]);
v2.push_back(beta_small_data[i][1]);
v3.push_back(beta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::betac(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
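
The watch timer from stopwatch.hpp is only exercised through construction, elapsed(), and reset(). A chrono-based stand-in with that interface (the real helper may differ):

#include <chrono>

// Hypothetical stand-in for the 'watch' helper used by the .cu tests.
class watch
{
    std::chrono::steady_clock::time_point start_ = std::chrono::steady_clock::now();

public:
    // Seconds since construction or the last reset().
    double elapsed() const
    {
        return std::chrono::duration<double>(std::chrono::steady_clock::now() - start_).count();
    }
    void reset() { start_ = std::chrono::steady_clock::now(); }
};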


@@ -0,0 +1,196 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_beta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_beta_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
h_in3[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::betac(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
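
Both the driver-API launcher here and the runtime-API tests above size the grid with the same ceiling division, so the last block may be only partially full; the i < numElements guard in each kernel masks off the excess threads. The arithmetic, checked at compile time:

// Ceiling division used for grid sizing throughout this commit.
constexpr int blocks_for(int num_elements, int block_size)
{
    return (num_elements + block_size - 1) / block_size;
}

static_assert(blocks_for(5000, 256) == 20, "5000 elements -> 20 blocks (5120 threads)");
static_assert(blocks_for(50000, 256) == 196, "50000 elements -> 196 blocks (50176 threads)");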


@@ -13,7 +13,7 @@
 # pragma clang diagnostic ignored "-Wliteral-range"
 #elif defined(__GNUC__)
 # pragma GCC diagnostic push
-# pragma GCC diagnostic ignored "-Wliteral-range"
+# pragma GCC diagnostic ignored "-Woverflow"
 #endif
 #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error


@@ -21,7 +21,7 @@
 # pragma clang diagnostic ignored "-Wliteral-range"
 #elif defined(__GNUC__)
 # pragma GCC diagnostic push
-# pragma GCC diagnostic ignored "-Wliteral-range"
+# pragma GCC diagnostic ignored "-Woverflow"
 #endif
 using boost::math::holtsmark_distribution;


@@ -3,7 +3,18 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wliteral-range"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Woverflow"
+#endif
 #include "test_ibeta.hpp"
 #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT)


@@ -8,9 +8,10 @@
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/math_fwd.hpp>
 #include <boost/math/tools/stats.hpp>
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include <boost/math/constants/constants.hpp>
 #include <boost/type_traits/is_floating_point.hpp>
 #include <boost/array.hpp>


@@ -4,7 +4,7 @@
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 #if defined(__GNUC__) && __GNUC__ <= 12
 #pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wliteral-range"
+#pragma GCC diagnostic ignored "-Woverflow"
 #endif
 #include <pch_light.hpp>
 #include "test_ibeta_derivative.hpp"


@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
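
ibeta_derivative(a, b, x) is documented as the partial derivative of the regularised incomplete beta function with respect to x, which is exactly the beta distribution's density. A quick host-side cross-check of that identity (values chosen arbitrarily):

#include <boost/math/special_functions/beta.hpp>
#include <boost/math/distributions/beta.hpp>
#include <cassert>
#include <cmath>

int main()
{
    const double a = 2.5, b = 3.5, x = 0.25;
    const double lhs = boost::math::ibeta_derivative(a, b, x);
    const double rhs = boost::math::pdf(boost::math::beta_distribution<double>(a, b), x);
    // d/dx ibeta(a, b, x) == pdf of beta(a, b) at x, up to rounding.
    assert(std::fabs(lhs - rhs) <= 1e-12 * rhs);
}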


@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_derivative_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_derivative_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_derivative_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_derivative_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta_derivative(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}


@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_derivative_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_derivative_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_derivative_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_derivative_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta_derivative(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

test/test_ibeta_double.cu (new file)

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
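
ibeta and the betac tested earlier are complements of one another in their regularised forms, which gives a cheap host-side sanity check independent of the tabulated data:

#include <boost/math/special_functions/beta.hpp>
#include <cassert>
#include <cmath>

int main()
{
    const double a = 4.0, b = 2.0, x = 0.3;
    // Regularised incomplete beta and its complement sum to 1.
    const double sum = boost::math::ibeta(a, b, x) + boost::math::ibetac(a, b, x);
    assert(std::fabs(sum - 1.0) < 1e-14);
}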

test/test_ibeta_float.cu (new file)

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}


@@ -3,7 +3,18 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wliteral-range"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Woverflow"
+#endif
 #include "test_ibeta_inv.hpp"
 #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT)


@@ -8,10 +8,11 @@
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/math/special_functions/beta.hpp>
 #include <boost/math/special_functions/next.hpp> // for has_denorm_now
+#include <boost/math/special_functions/math_fwd.hpp>
 #include <boost/math/tools/stats.hpp>
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include <boost/math/constants/constants.hpp>
 #include <boost/type_traits/is_floating_point.hpp>
 #include <boost/array.hpp>
@@ -306,6 +307,7 @@ void test_spots(T)
 BOOST_MATH_CHECK_THROW(::boost::math::ibeta_inv(static_cast<T>(2.125), -n, static_cast<T>(0.125)), std::domain_error);
 BOOST_MATH_CHECK_THROW(::boost::math::ibeta_inv(static_cast<T>(2.125), static_cast<T>(1.125), -n), std::domain_error);
 }
+#ifndef SYCL_LANGUAGE_VERSION
 if (boost::math::detail::has_denorm_now<T>())
 {
 T m = std::numeric_limits<T>::denorm_min();
@@ -317,5 +319,6 @@ void test_spots(T)
 BOOST_CHECK((boost::math::isfinite)(boost::math::ibeta_inv(static_cast<T>(12.125), m, static_cast<T>(0.125))));
 BOOST_CHECK((boost::math::isfinite)(boost::math::ibeta_inv(m, m, static_cast<T>(0.125))));
 }
+#endif
 }


@@ -3,7 +3,18 @@
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#ifndef SYCL_LANGUAGE_VERSION
 #include <pch_light.hpp>
+#endif
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wliteral-range"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Woverflow"
+#endif
 #include "test_ibeta_inv_ab.hpp"
 #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT)


@@ -10,9 +10,10 @@
 #define BOOST_TEST_MAIN
 #include <boost/test/unit_test.hpp>
 #include <boost/test/tools/floating_point_comparison.hpp>
 #include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/math_fwd.hpp>
 #include <boost/math/tools/stats.hpp>
-#include <boost/math/tools/test.hpp>
+#include "../include_private/boost/math/tools/test.hpp"
 #include <boost/math/constants/constants.hpp>
 #include <boost/type_traits/is_floating_point.hpp>
 #include <boost/array.hpp>


@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
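// This variant exercises single precision. BOOST_MATH_PROMOTE_DOUBLE_POLICY is
// set to false above because internal promotion to long double is unavailable
// in CUDA device code.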
/**
* CUDA kernel: computes boost::math::ibeta_inv element-wise over the input arrays.
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inv_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_inv_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
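// Note: the include paths above are specific to the CI image and the author's
// machine; adjust them to your own Boost and CUDA installations.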
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inv_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
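// cuLaunchKernel takes an array of pointers to each kernel argument, hence the
// address-of on every parameter below.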
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta_inv(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inv_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_inv_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inv_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta_inv(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA kernel: computes boost::math::ibeta_inva element-wise (per the Boost.Math
* docs, the a parameter satisfying ibeta(a, b, x) == p).
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
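// Cycle through the consolidated table so all 50000 work items get real data.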
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA kernel: computes boost::math::ibeta_inva element-wise (per the Boost.Math
* docs, the a parameter satisfying ibeta(a, b, x) == p).
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
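// Tolerance: flag anything more than 300 machine epsilons from the serial reference.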
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inva_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_inva_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inva_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Occasionally the ignore_error policy is not honored and the call below throws;
// rather than terminating, skip the row and keep processing the results array.
double res;
try
{
res = boost::math::ibeta_inva(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inva_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_inva_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
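// From here the driver API takes over: the PTX is JIT-loaded as a module and the
// kernel retrieved as a CUfunction by its unmangled name.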
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inva_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Occasionally the ignore_error policy is not honored and the call below throws;
// rather than terminating, skip the row and keep processing the results array.
double res;
try
{
res = boost::math::ibeta_inva(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA kernel: computes boost::math::ibeta_invb element-wise (per the Boost.Math
* docs, the b parameter satisfying ibeta(a, b, x) == p).
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA kernel: computes boost::math::ibeta_invb element-wise (per the Boost.Math
* docs, the b parameter satisfying ibeta(a, b, x) == p).
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try{
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibeta_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_invb_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_invb_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_invb_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Occasionally the ignore_error policy is not honored and the call below throws;
// rather than terminating, skip the row and keep processing the results array.
double res;
try
{
res = boost::math::ibeta_invb(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_invb_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_invb_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_invb_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
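// Ceiling division: size the grid so every element is covered even when numElements is not a multiple of blockSize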
int numBlocks = (numElements + blockSize - 1) / blockSize;
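// The CUDA driver API receives kernel arguments as an array of pointers to each argument value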
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
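// Allow up to 300 eps of disagreement: device code may use FMA contraction and different rounding paths than the host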
for (int i = 0; i < numElements; ++i)
{
// Sometimes the ignore_error policy is not applied and the call below throws.
// Rather than terminating, continue processing the rest of the results array.
double res;
try
{
res = boost::math::ibeta_invb(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
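// Recompute each value on the host with the same inputs to serve as the reference result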
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibeta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibeta_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibeta(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
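// cuda_managed_ptr is a test helper that (presumably via cudaMallocManaged) shares these buffers between host and device, so no explicit copies are needed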
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
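// Recycle the consolidated table entries to fill all numElements inputs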
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the test CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
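// Time the kernel from launch through device synchronization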
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the test CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
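// extern "C" disables name mangling so cuModuleGetFunction can locate the kernel by name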
extern "C" __global__
void test_ibetac_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inv_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_inv_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inv_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibetac_inv(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inv_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_inv_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inv_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibetac_inv(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the test CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the test CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inva_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_inva_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inva_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Sometimes the ignore_error policy is not applied and the call below throws.
// Rather than terminating, continue processing the rest of the results array.
double res;
try
{
res = boost::math::ibetac_inva(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inva_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_inva_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inva_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Sometimes the ignore_error policy is not applied, so the call below can throw.
// Rather than terminating, we continue through the rest of the results array.
double res;
try
{
res = boost::math::ibetac_inva(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
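The try/catch around the serial reference call above works around the macro-based ignore_error policy occasionally not taking effect. A minimal sketch (not part of this commit) of requesting the same policies per call, so the host-side reference computation cannot throw regardless of header inclusion order; the helper name is hypothetical:

#include <boost/math/special_functions/beta.hpp>
#include <boost/math/policies/policy.hpp>

double reference_ibetac_inva(double b, double x, double p)
{
    using namespace boost::math::policies;
    // Per-call policy equivalent to the BOOST_MATH_*_ERROR_POLICY macros above.
    using ignore_all = policy<domain_error<ignore_error>,
                              overflow_error<ignore_error>,
                              evaluation_error<ignore_error>>;
    return boost::math::ibetac_inva(b, x, p, ignore_all());
}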

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
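These timing tests allocate through the repo's cuda_managed_ptr helper. For readers without the test tree, a minimal sketch of the kind of RAII wrapper that name implies; the shipped header may differ in detail, and the class name here is changed so it is not mistaken for the real interface:

#include <cuda_runtime.h>
#include <cstddef>

template <class T>
class managed_buffer   // hypothetical stand-in for cuda_managed_ptr
{
    T* ptr_ = nullptr;
public:
    explicit managed_buffer(std::size_t n) { cudaMallocManaged(&ptr_, n * sizeof(T)); }
    ~managed_buffer() { cudaFree(ptr_); }
    managed_buffer(const managed_buffer&) = delete;
    managed_buffer& operator=(const managed_buffer&) = delete;
    T* get() const { return ptr_; }                          // raw pointer for kernel launches
    T& operator[](std::size_t i) const { return ptr_[i]; }   // host-side access to managed memory
};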

View File

@@ -0,0 +1,149 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// floating-point value does not fit in required floating-point type
#pragma nv_diag_suppress 221
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/array.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
}
}
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
/**
* Host main routine
*/
int main(void)
{
try {
// Consolidate the test data:
std::vector<float_type> v1, v2, v3;
for(unsigned i = 0; i < ibeta_data.size(); ++i)
{
v1.push_back(ibeta_data[i][0]);
v2.push_back(ibeta_data[i][1]);
v3.push_back(ibeta_data[i][2]);
}
for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
{
v1.push_back(ibeta_small_data[i][0]);
v2.push_back(ibeta_small_data[i][1]);
v3.push_back(ibeta_small_data[i][2]);
}
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vectors
cuda_managed_ptr<float_type> input_vector1(numElements);
cuda_managed_ptr<float_type> input_vector2(numElements);
cuda_managed_ptr<float_type> input_vector3(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
int table_id = i % v1.size();
input_vector1[i] = v1[table_id];
input_vector2[i] = v2[table_id];
input_vector3[i] = v3[table_id];
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::ibetac_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::isfinite(output_vector[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
}
return 0;
}
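The 300-eps tolerance check recurs in every verification loop. Factored out, it looks like the sketch below; this is a convenience for exposition, not code from the commit, and the threshold is the tests' own choice rather than a library constant:

#include <boost/math/special_functions/relative_difference.hpp>
#include <boost/math/special_functions/fpclassify.hpp>

template <class T>
bool within_tolerance(T device_result, T host_result, T max_eps = T(300))
{
    // Non-finite host references are skipped, mirroring the loops above.
    if (!(boost::math::isfinite)(host_result))
        return true;
    return boost::math::epsilon_difference(device_result, host_result) <= max_eps;
}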

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_invb_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_invb_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_invb_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Sometimes the ignore_error policy is not applied, so the call below can throw.
// Rather than terminating, we continue through the rest of the results array.
double res;
try
{
res = boost::math::ibetac_invb(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
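Each NVRTC test registers its kernel name with nvrtcAddNameExpression even though the kernel is declared extern "C" and is therefore unmangled. When that linkage is dropped, the lowered (mangled) name must be queried after compilation, roughly as sketched below; the kernel name is illustrative and this snippet is not part of the commit:

// Sketch: retrieving the mangled name when the kernel is not extern "C".
// The name expression must have been registered with nvrtcAddNameExpression
// before nvrtcCompileProgram was called.
const char* lowered_name = nullptr;
checkNVRTCError(nvrtcGetLoweredName(prog, "some_templated_kernel", &lowered_name),
                "Failed to get lowered name");
checkCUError(cuModuleGetFunction(&kernel, module, lowered_name),
             "Failed to get kernel function");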

View File

@@ -0,0 +1,220 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_invb_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_invb_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_invb_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
// Sometimes the ignore_error policy is not applied, so the call below can throw.
// Rather than terminating, we continue through the rest of the results array.
double res;
try
{
res = boost::math::ibetac_invb(h_in1[i], h_in2[i], h_in3[i]);
}
catch (...)
{
continue;
}
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
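The BOOST_MATH_PROMOTE_DOUBLE_POLICY false define that heads these tests corresponds to the promote_double<false> policy, which stops double arguments being internally promoted to long double, a type CUDA devices do not support. A per-call sketch of the same request (function and variable names illustrative, not from the commit):

#include <boost/math/special_functions/beta.hpp>
#include <boost/math/policies/policy.hpp>

float_type ibetac_no_long_double(float_type a, float_type b, float_type x)
{
    // Equivalent of defining BOOST_MATH_PROMOTE_DOUBLE_POLICY to false,
    // expressed as a per-call policy.
    using no_ld = boost::math::policies::policy<
        boost::math::policies::promote_double<false>>;
    return boost::math::ibetac(a, b, x, no_ld());
}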

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibetac(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
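All of these tests size their grids with the same ceiling-division idiom. A worked instance for concreteness (the numbers here are illustrative):

int blockSize   = 256;
int numElements = 1000;                                       // example value
int numBlocks   = (numElements + blockSize - 1) / blockSize;  // (1000 + 255) / 256 == 4
// Four blocks cover 1024 threads; the `if (i < numElements)` guard in each
// kernel discards the 24 surplus threads.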

View File

@@ -0,0 +1,207 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/beta.hpp>
extern "C" __global__
void test_ibetac_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::ibetac(in1[i], in2[i], in3[i]);
}
}
)";
template <class T> struct table_type { typedef T type; };
typedef float_type T;
#define SC_(x) static_cast<T>(x)
#include "ibeta_data.ipp"
#include "ibeta_small_data.ipp"
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_ibetac_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_kernel"), "Failed to get kernel function");
int numElements = ibeta_data.size() + ibeta_small_data.size();
float_type *h_in1, *h_in2, *h_in3, *h_out;
float_type *d_in1, *d_in2, *d_in3, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_in3 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
for (int i = 0; i < ibeta_data.size(); ++i)
{
h_in1[i] = ibeta_data[i][0];
h_in2[i] = ibeta_data[i][1];
h_in3[i] = ibeta_data[i][2];
}
for (int i = 0; i < ibeta_small_data.size(); ++i)
{
h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::ibetac(h_in1[i], h_in2[i], h_in3[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_in3);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -12,7 +12,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_igamma.hpp"

View File

@@ -12,7 +12,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include "test_igamma_inv.hpp"

View File

@@ -12,7 +12,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#include "test_igamma_inva.hpp"

View File

@@ -20,7 +20,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
using boost::math::landau_distribution;

View File

@@ -9,7 +9,7 @@
# pragma clang diagnostic ignored "-Wliteral-range"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wliteral-range"
# pragma GCC diagnostic ignored "-Woverflow"
#endif
#define BOOST_TEST_MAIN