
Add GPU support to expint

Add SYCL testing of expint

Add markers to forward decls

Add CUDA testing of expint

Fix static variable usage under NVRTC

Add NVRTC testing

Add configurable definition of complex

Add function aliases

Add GPU support to gegenbauer polynomials

Add SYCL testing of gegenbauer

Add NVCC testing of gegenbauer

Add NVRTC testing of gegenbauer

Add GPU support for hankel

Add SYCL testing of hankel

Add NVCC testing of cyl_hankel_1

Add comprehensive NVCC testing

Add NVRTC testing of cyl and sph hankel

Update docs

Fix writing cuda::std::complex<T> to stdout

Add GPU support to hermite

Add SYCL testing of hermite

Add CUDA testing of hermite

Add NVRTC testing of hermite

Add markers to hermite docs
Matt Borland
2024-09-17 15:23:54 -04:00
parent c3afa49c9a
commit bbb8eee1d6
49 changed files with 4852 additions and 246 deletions
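To orient the diffs that follow: the BOOST_MATH_GPU_ENABLED marker added throughout expands, roughly, to `__host__ __device__` under NVCC (see the config.hpp hunk later in this commit) and to nothing in host-only builds. A minimal sketch of the usage this enables, assuming a CUDA translation unit; the kernel and its shape are illustrative only, not part of the commit:

    #include <boost/math/special_functions/expint.hpp>

    // Once expint carries BOOST_MATH_GPU_ENABLED, the same overload is
    // callable from device code as well as from the host.
    __global__ void expint_kernel(const double* in, double* out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
        {
            out[i] = boost::math::expint(1U, in[i]); // E_1(z)
        }
    }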

View File

@@ -18,10 +18,10 @@ LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
namespace boost { namespace math {
template <class T>
``__sf_result`` airy_ai(T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai(T x);
template <class T, class Policy>
``__sf_result`` airy_ai(T x, const Policy&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai(T x, const Policy&);
}} // namespaces
@@ -78,10 +78,10 @@ This function is implemented in terms of the Bessel functions using the relation
namespace boost { namespace math {
template <class T>
``__sf_result`` airy_bi(T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi(T x);
template <class T, class Policy>
``__sf_result`` airy_bi(T x, const Policy&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi(T x, const Policy&);
}} // namespaces
@@ -132,10 +132,10 @@ This function is implemented in terms of the Bessel functions using the relation
namespace boost { namespace math {
template <class T>
``__sf_result`` airy_ai_prime(T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai_prime(T x);
template <class T, class Policy>
``__sf_result`` airy_ai_prime(T x, const Policy&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_ai_prime(T x, const Policy&);
}} // namespaces
@@ -186,10 +186,10 @@ This function is implemented in terms of the Bessel functions using the relation
namespace boost { namespace math {
template <class T>
``__sf_result`` airy_bi_prime(T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi_prime(T x);
template <class T, class Policy>
``__sf_result`` airy_bi_prime(T x, const Policy&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` airy_bi_prime(T x, const Policy&);
}} // namespaces
@@ -242,23 +242,23 @@ by providing an output iterator.
The signatures of the single-value functions are:
template <class T>
T airy_ai_zero(
BOOST_MATH_GPU_ENABLED T airy_ai_zero(
int m); // 1-based index of zero.
template <class T>
T airy_bi_zero(
BOOST_MATH_GPU_ENABLED T airy_bi_zero(
int m); // 1-based index of zero.
and for multiple zeros:
template <class T, class OutputIterator>
OutputIterator airy_ai_zero(
BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(
int start_index, // 1-based index of first zero.
unsigned number_of_zeros, // How many zeros to generate.
OutputIterator out_it); // Destination for zeros.
template <class T, class OutputIterator>
OutputIterator airy_bi_zero(
BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(
int start_index, // 1-based index of zero.
unsigned number_of_zeros, // How many zeros to generate
OutputIterator out_it); // Destination for zeros.
@@ -266,25 +266,25 @@ and for multiple zeros:
There are also versions which allow control of the __policy_section for error handling and precision.
template <class T>
T airy_ai_zero(
BOOST_MATH_GPU_ENABLED T airy_ai_zero(
int m, // 1-based index of zero.
const Policy&); // Policy to use.
template <class T>
T airy_bi_zero(
BOOST_MATH_GPU_ENABLED T airy_bi_zero(
int m, // 1-based index of zero.
const Policy&); // Policy to use.
template <class T, class OutputIterator>
OutputIterator airy_ai_zero(
BOOST_MATH_GPU_ENABLED OutputIterator airy_ai_zero(
int start_index, // 1-based index of first zero.
unsigned number_of_zeros, // How many zeros to generate.
OutputIterator out_it, // Destination for zeros.
const Policy& pol); // Policy to use.
template <class T, class OutputIterator>
OutputIterator airy_bi_zero(
BOOST_MATH_GPU_ENABLED OutputIterator airy_bi_zero(
int start_index, // 1-based index of zero.
unsigned number_of_zeros, // How many zeros to generate.
OutputIterator out_it, // Destination for zeros.
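For illustration, the multiple-zeros overloads above fill a caller-supplied output iterator; a minimal host-side sketch (start index and count are arbitrary):

    #include <boost/math/special_functions/airy.hpp>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main()
    {
        std::vector<double> zeros;
        // First three zeros of Ai, using the 1-based start index:
        boost::math::airy_ai_zero<double>(1, 3, std::back_inserter(zeros));
        for (double z : zeros)
            std::cout << z << '\n'; // all zeros of Ai are negative
    }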

View File

@@ -11,10 +11,10 @@
namespace boost{ namespace math{
template <class T>
``__sf_result`` expint(unsigned n, T z);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z);
template <class T, class ``__Policy``>
``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
}} // namespaces
@@ -26,10 +26,10 @@ the return type is `double` if T is an integer type, and T otherwise.
[h4 Description]
template <class T>
``__sf_result`` expint(unsigned n, T z);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z);
template <class T, class ``__Policy``>
``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(unsigned n, T z, const ``__Policy``&);
Returns the [@http://mathworld.wolfram.com/En-Function.html exponential integral En]
of z:
@@ -100,10 +100,10 @@ is used.
namespace boost{ namespace math{
template <class T>
``__sf_result`` expint(T z);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z);
template <class T, class ``__Policy``>
``__sf_result`` expint(T z, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z, const ``__Policy``&);
}} // namespaces
@@ -115,10 +115,10 @@ the return type is `double` if T is an integer type, and T otherwise.
[h4 Description]
template <class T>
``__sf_result`` expint(T z);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z);
template <class T, class ``__Policy``>
``__sf_result`` expint(T z, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` expint(T z, const ``__Policy``&);
Returns the [@http://mathworld.wolfram.com/ExponentialIntegral.html exponential integral]
of z:
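A quick host-side sketch distinguishing the two forms documented in this section (printed values are approximate):

    #include <boost/math/special_functions/expint.hpp>
    #include <iostream>

    int main()
    {
        std::cout << boost::math::expint(1U, 1.0) << '\n'; // E_1(1) ~= 0.219384
        std::cout << boost::math::expint(1.0)     << '\n'; // Ei(1)  ~= 1.89512
    }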

View File

@@ -16,13 +16,13 @@
namespace boost{ namespace math{
template<typename Real>
Real gegenbauer(unsigned n, Real lambda, Real x);
BOOST_MATH_GPU_ENABLED Real gegenbauer(unsigned n, Real lambda, Real x);
template<typename Real>
Real gegenbauer_prime(unsigned n, Real lambda, Real x);
BOOST_MATH_GPU_ENABLED Real gegenbauer_prime(unsigned n, Real lambda, Real x);
template<typename Real>
Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k);
BOOST_MATH_GPU_ENABLED Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k);
}} // namespaces
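As a sanity check on the signatures above: with lambda = 1 the Gegenbauer polynomials reduce to the Chebyshev polynomials of the second kind, so values are easy to verify by hand. A short host-side sketch:

    #include <boost/math/special_functions/gegenbauer.hpp>
    #include <iostream>

    int main()
    {
        // C_3^1(x) = U_3(x) = 8x^3 - 4x; at x = 0.5 this is -1.
        std::cout << boost::math::gegenbauer(3, 1.0, 0.5) << '\n';
    }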

View File

@@ -3,18 +3,36 @@
[h4 Synopsis]
template <class T1, class T2>
std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__)
template <class T1, class T2>
std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
#else // When using cuda we use namespace cuda::std:: instead of std::
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_1(T1 v, T2 x, const ``__Policy``&);
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> cyl_hankel_2(T1 v, T2 x, const ``__Policy``&);
#endif
[h4 Description]
@@ -77,18 +95,35 @@ routines for integer order are used.
[h4 Synopsis]
template <class T1, class T2>
std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&);
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__)
template <class T1, class T2>
std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&);
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&);
#else // When using cuda we use namespace cuda::std:: instead of std::
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_1(T1 v, T2 x, const ``__Policy``&);
template <class T1, class T2>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x);
template <class T1, class T2, class ``__Policy``>
BOOST_MATH_GPU_ENABLED cuda::std::complex<``__sf_result``> sph_hankel_2(T1 v, T2 x, const ``__Policy``&);
#endif
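The practical effect of the split synopsis above: the return type is spelled differently per backend, but generic callers need not care. A host-side sketch; under NVCC the same call yields cuda::std::complex per the CUDA branch:

    #include <boost/math/special_functions/hankel.hpp>
    #include <iostream>

    int main()
    {
        // H^(1)_v(x) = J_v(x) + i Y_v(x)
        auto h = boost::math::cyl_hankel_1(0.0, 1.0);
        std::cout << h.real() << " + " << h.imag() << "i\n";
    }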
[h4 Description]

View File

@@ -9,13 +9,13 @@
namespace boost{ namespace math{
template <class T>
``__sf_result`` hermite(unsigned n, T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x);
template <class T, class ``__Policy``>
``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
template <class T1, class T2, class T3>
``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
}} // namespaces
@@ -26,10 +26,10 @@ note that when there is a single template argument the result is the same type
as that argument or `double` if the template argument is an integer type.
template <class T>
``__sf_result`` hermite(unsigned n, T x);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x);
template <class T, class ``__Policy``>
``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite(unsigned n, T x, const ``__Policy``&);
Returns the value of the Hermite Polynomial of order /n/ at point /x/:
@@ -43,7 +43,7 @@ Hermite Polynomials:
[graph hermite]
template <class T1, class T2, class T3>
``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
BOOST_MATH_GPU_ENABLED ``__sf_result`` hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
Implements the three-term recurrence relation for the Hermite
polynomials; this function can be used to create a sequence of
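A worked sketch of that recurrence, H_{n+1}(x) = 2x H_n(x) - 2n H_{n-1}(x), using the signatures above:

    #include <boost/math/special_functions/hermite.hpp>
    #include <iostream>

    int main()
    {
        double x = 0.5;
        double h0 = boost::math::hermite(0, x); // H_0 = 1
        double h1 = boost::math::hermite(1, x); // H_1 = 2x
        // hermite_next(n, x, H_n, H_{n-1}) returns H_{n+1}:
        double h2 = boost::math::hermite_next(1, x, h1, h0);
        std::cout << h2 << '\n'; // H_2(x) = 4x^2 - 2 = -1 at x = 0.5
    }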

View File

@@ -47,50 +47,6 @@
namespace boost{ namespace math{
// Since we cannot pull this in from math fwd we need a copy
#ifdef BOOST_MATH_HAS_NVRTC
namespace detail{
typedef boost::math::integral_constant<int, 0> bessel_no_int_tag; // No integer optimisation possible.
typedef boost::math::integral_constant<int, 1> bessel_maybe_int_tag; // Maybe integer optimisation.
typedef boost::math::integral_constant<int, 2> bessel_int_tag; // Definite integer optimisation.
template <class T1, class T2, class Policy>
struct bessel_traits
{
using result_type = typename boost::math::conditional<
boost::math::is_integral<T1>::value,
typename tools::promote_args<T2>::type,
tools::promote_args_t<T1, T2>
>::type;
typedef typename policies::precision<result_type, Policy>::type precision_type;
using optimisation_tag = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 64),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
using optimisation_tag128 = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 113),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
};
} // detail
#endif
namespace detail{
template <class T, class Policy>

View File

@@ -1,4 +1,5 @@
// Copyright John Maddock 2007.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -12,6 +13,10 @@
#pragma warning(disable:4702) // Unreachable code (release mode only warning)
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/promotion.hpp>
#include <boost/math/tools/fraction.hpp>
@@ -20,7 +25,6 @@
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/digamma.hpp>
#include <boost/math/special_functions/log1p.hpp>
#include <boost/math/special_functions/pow.hpp>
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -35,13 +39,13 @@
namespace boost{ namespace math{
template <class T, class Policy>
inline typename tools::promote_args<T>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
expint(unsigned n, T z, const Policy& /*pol*/);
namespace detail{
template <class T>
inline T expint_1_rational(const T& z, const std::integral_constant<int, 0>&)
BOOST_MATH_GPU_ENABLED inline T expint_1_rational(const T& z, const boost::math::integral_constant<int, 0>&)
{
// this function is never actually called
BOOST_MATH_ASSERT(0);
@@ -49,7 +53,7 @@ inline T expint_1_rational(const T& z, const std::integral_constant<int, 0>&)
}
template <class T>
T expint_1_rational(const T& z, const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant<int, 53>&)
{
BOOST_MATH_STD_USING
T result;
@@ -123,7 +127,7 @@ T expint_1_rational(const T& z, const std::integral_constant<int, 53>&)
}
template <class T>
T expint_1_rational(const T& z, const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant<int, 64>&)
{
BOOST_MATH_STD_USING
T result;
@@ -204,7 +208,7 @@ T expint_1_rational(const T& z, const std::integral_constant<int, 64>&)
}
template <class T>
T expint_1_rational(const T& z, const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED T expint_1_rational(const T& z, const boost::math::integral_constant<int, 113>&)
{
BOOST_MATH_STD_USING
T result;
@@ -351,14 +355,15 @@ T expint_1_rational(const T& z, const std::integral_constant<int, 113>&)
return result;
}
template <class T>
struct expint_fraction
{
typedef std::pair<T,T> result_type;
expint_fraction(unsigned n_, T z_) : b(n_ + z_), i(-1), n(n_){}
std::pair<T,T> operator()()
typedef boost::math::pair<T,T> result_type;
BOOST_MATH_GPU_ENABLED expint_fraction(unsigned n_, T z_) : b(n_ + z_), i(-1), n(n_){}
BOOST_MATH_GPU_ENABLED boost::math::pair<T,T> operator()()
{
std::pair<T,T> result = std::make_pair(-static_cast<T>((i+1) * (n+i)), b);
boost::math::pair<T,T> result = boost::math::make_pair(-static_cast<T>((i+1) * (n+i)), b);
b += 2;
++i;
return result;
@@ -370,11 +375,11 @@ private:
};
template <class T, class Policy>
inline T expint_as_fraction(unsigned n, T z, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T expint_as_fraction(unsigned n, T z, const Policy& pol)
{
BOOST_MATH_STD_USING
BOOST_MATH_INSTRUMENT_VARIABLE(z)
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
expint_fraction<T> f(n, z);
T result = tools::continued_fraction_b(
f,
@@ -392,9 +397,9 @@ template <class T>
struct expint_series
{
typedef T result_type;
expint_series(unsigned k_, T z_, T x_k_, T denom_, T fact_)
BOOST_MATH_GPU_ENABLED expint_series(unsigned k_, T z_, T x_k_, T denom_, T fact_)
: k(k_), z(z_), x_k(x_k_), denom(denom_), fact(fact_){}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
x_k *= -z;
denom += 1;
@@ -410,10 +415,10 @@ private:
};
template <class T, class Policy>
inline T expint_as_series(unsigned n, T z, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T expint_as_series(unsigned n, T z, const Policy& pol)
{
BOOST_MATH_STD_USING
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
BOOST_MATH_INSTRUMENT_VARIABLE(z)
@@ -443,10 +448,10 @@ inline T expint_as_series(unsigned n, T z, const Policy& pol)
}
template <class T, class Policy, class Tag>
T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag)
BOOST_MATH_GPU_ENABLED T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::expint<%1%>(unsigned, %1%)";
constexpr auto function = "boost::math::expint<%1%>(unsigned, %1%)";
if(z < 0)
return policies::raise_domain_error<T>(function, "Function requires z >= 0 but got %1%.", z, pol);
if(z == 0)
@@ -468,15 +473,21 @@ T expint_imp(unsigned n, T z, const Policy& pol, const Tag& tag)
# pragma warning(disable:4127) // conditional expression is constant
#endif
if(n == 0)
{
result = exp(-z) / z;
}
else if((n == 1) && (Tag::value))
{
result = expint_1_rational(z, tag);
}
else if(f)
{
result = expint_as_series(n, z, pol);
}
else
{
result = expint_as_fraction(n, z, pol);
}
#ifdef _MSC_VER
# pragma warning(pop)
#endif
@@ -488,8 +499,8 @@ template <class T>
struct expint_i_series
{
typedef T result_type;
expint_i_series(T z_) : k(0), z_k(1), z(z_){}
T operator()()
BOOST_MATH_GPU_ENABLED expint_i_series(T z_) : k(0), z_k(1), z(z_){}
BOOST_MATH_GPU_ENABLED T operator()()
{
z_k *= z / ++k;
return z_k / k;
@@ -501,22 +512,22 @@ private:
};
template <class T, class Policy>
T expint_i_as_series(T z, const Policy& pol)
BOOST_MATH_GPU_ENABLED T expint_i_as_series(T z, const Policy& pol)
{
BOOST_MATH_STD_USING
T result = log(z); // (log(z) - log(1 / z)) / 2;
result += constants::euler<T>();
expint_i_series<T> s(z);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
result = tools::sum_series(s, policies::get_epsilon<T, Policy>(), max_iter, result);
policies::check_series_iterations<T>("boost::math::expint_i_series<%1%>(%1%)", max_iter, pol);
return result;
}
template <class T, class Policy, class Tag>
T expint_i_imp(T z, const Policy& pol, const Tag& tag)
BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const Tag& tag)
{
static const char* function = "boost::math::expint<%1%>(%1%)";
constexpr auto function = "boost::math::expint<%1%>(%1%)";
if(z < 0)
return -expint_imp(1, T(-z), pol, tag);
if(z == 0)
@@ -525,10 +536,10 @@ T expint_i_imp(T z, const Policy& pol, const Tag& tag)
}
template <class T, class Policy>
T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& tag)
BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant<int, 53>& tag)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::expint<%1%>(%1%)";
constexpr auto function = "boost::math::expint<%1%>(%1%)";
if(z < 0)
return -expint_imp(1, T(-z), pol, tag);
if(z == 0)
@@ -541,7 +552,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
// Maximum Deviation Found: 2.852e-18
// Expected Error Term: 2.852e-18
// Max Error found at double precision = Poly: 2.636335e-16 Cheb: 4.187027e-16
static const T P[10] = {
BOOST_MATH_STATIC const T P[10] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 2.98677224343598593013),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.356343618769377415068),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.780836076283730801839),
@@ -553,7 +564,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, 0.798296365679269702435e-5),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.2777056254402008721e-6)
};
static const T Q[8] = {
BOOST_MATH_STATIC const T Q[8] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 53, -1.17090412365413911947),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.62215109846016746276),
@@ -564,11 +575,11 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, -0.138972589601781706598e-4)
};
static const T c1 = BOOST_MATH_BIG_CONSTANT(T, 53, 1677624236387711.0);
static const T c2 = BOOST_MATH_BIG_CONSTANT(T, 53, 4503599627370496.0);
static const T r1 = static_cast<T>(c1 / c2);
static const T r2 = BOOST_MATH_BIG_CONSTANT(T, 53, 0.131401834143860282009280387409357165515556574352422001206362e-16);
static const T r = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 0.372507410781366634461991866580119133535689497771654051555657435242200120636201854384926049951548942392));
BOOST_MATH_STATIC_LOCAL_VARIABLE const T c1 = BOOST_MATH_BIG_CONSTANT(T, 53, 1677624236387711.0);
BOOST_MATH_STATIC_LOCAL_VARIABLE const T c2 = BOOST_MATH_BIG_CONSTANT(T, 53, 4503599627370496.0);
BOOST_MATH_STATIC_LOCAL_VARIABLE const T r1 = static_cast<T>(c1 / c2);
BOOST_MATH_STATIC_LOCAL_VARIABLE const T r2 = BOOST_MATH_BIG_CONSTANT(T, 53, 0.131401834143860282009280387409357165515556574352422001206362e-16);
BOOST_MATH_STATIC_LOCAL_VARIABLE const T r = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 0.372507410781366634461991866580119133535689497771654051555657435242200120636201854384926049951548942392));
T t = (z / 3) - 1;
result = tools::evaluate_polynomial(P, t)
/ tools::evaluate_polynomial(Q, t);
@@ -588,8 +599,8 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
// Maximum Deviation Found: 6.546e-17
// Expected Error Term: 6.546e-17
// Max Error found at double precision = Poly: 6.890169e-17 Cheb: 6.772128e-17
static const T Y = 1.158985137939453125F;
static const T P[8] = {
BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.158985137939453125F;
BOOST_MATH_STATIC const T P[8] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 0.00139324086199402804173),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0349921221823888744966),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0264095520754134848538),
@@ -599,7 +610,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, -0.554086272024881826253e-4),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.396487648924804510056e-5)
};
static const T Q[8] = {
BOOST_MATH_STATIC const T Q[8] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.744625566823272107711),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.329061095011767059236),
@@ -621,8 +632,8 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
// Expected Error Term: -1.842e-17
// Max Error found at double precision = Poly: 4.375868e-17 Cheb: 5.860967e-17
static const T Y = 1.0869731903076171875F;
static const T P[9] = {
BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.0869731903076171875F;
BOOST_MATH_STATIC const T P[9] = {
BOOST_MATH_BIG_CONSTANT(T, 53, -0.00893891094356945667451),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0484607730127134045806),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0652810444222236895772),
@@ -633,7 +644,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, -0.000209750022660200888349),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.138652200349182596186e-4)
};
static const T Q[9] = {
BOOST_MATH_STATIC const T Q[9] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 53, 1.97017214039061194971),
BOOST_MATH_BIG_CONSTANT(T, 53, 1.86232465043073157508),
@@ -657,8 +668,8 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
// Max Error found at double precision = Poly: 1.441088e-16 Cheb: 1.864792e-16
static const T Y = 1.03937530517578125F;
static const T P[9] = {
BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y = 1.03937530517578125F;
BOOST_MATH_STATIC const T P[9] = {
BOOST_MATH_BIG_CONSTANT(T, 53, -0.00356165148914447597995),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0229930320357982333406),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0449814350482277917716),
@@ -669,7 +680,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, -0.000192178045857733706044),
BOOST_MATH_BIG_CONSTANT(T, 53, -0.113161784705911400295e-9)
};
static const T Q[9] = {
BOOST_MATH_STATIC const T Q[9] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 53, 2.84354408840148561131),
BOOST_MATH_BIG_CONSTANT(T, 53, 3.6599610090072393012),
@@ -688,9 +699,9 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
else
{
// Max Error found at double precision = 3.381886e-17
static const T exp40 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 2.35385266837019985407899910749034804508871617254555467236651e17));
static const T Y= 1.013065338134765625F;
static const T P[6] = {
BOOST_MATH_STATIC_LOCAL_VARIABLE const T exp40 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 53, 2.35385266837019985407899910749034804508871617254555467236651e17));
BOOST_MATH_STATIC_LOCAL_VARIABLE const T Y= 1.013065338134765625F;
BOOST_MATH_STATIC const T P[6] = {
BOOST_MATH_BIG_CONSTANT(T, 53, -0.0130653381347656243849),
BOOST_MATH_BIG_CONSTANT(T, 53, 0.19029710559486576682),
BOOST_MATH_BIG_CONSTANT(T, 53, 94.7365094537197236011),
@@ -698,7 +709,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
BOOST_MATH_BIG_CONSTANT(T, 53, 18932.0850014925993025),
BOOST_MATH_BIG_CONSTANT(T, 53, -38703.1431362056714134)
};
static const T Q[7] = {
BOOST_MATH_STATIC const T Q[7] = {
BOOST_MATH_BIG_CONSTANT(T, 53, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 53, 61.9733592849439884145),
BOOST_MATH_BIG_CONSTANT(T, 53, -2354.56211323420194283),
@@ -739,10 +750,10 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 53>& ta
}
template <class T, class Policy>
T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 64>& tag)
BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant<int, 64>& tag)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::expint<%1%>(%1%)";
constexpr auto function = "boost::math::expint<%1%>(%1%)";
if(z < 0)
return -expint_imp(1, T(-z), pol, tag);
if(z == 0)
@@ -976,7 +987,7 @@ T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 64>& ta
}
template <class T, class Policy>
void expint_i_imp_113a(T& result, const T& z, const Policy& pol)
BOOST_MATH_GPU_ENABLED void expint_i_imp_113a(T& result, const T& z, const Policy& pol)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 1.230e-36
@@ -1044,7 +1055,7 @@ void expint_i_imp_113a(T& result, const T& z, const Policy& pol)
}
template <class T>
void expint_i_113b(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113b(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 7.779e-36
@@ -1094,7 +1105,7 @@ void expint_i_113b(T& result, const T& z)
}
template <class T>
void expint_i_113c(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113c(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 1.082e-34
@@ -1147,7 +1158,7 @@ void expint_i_113c(T& result, const T& z)
}
template <class T>
void expint_i_113d(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113d(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 3.163e-35
@@ -1198,7 +1209,7 @@ void expint_i_113d(T& result, const T& z)
}
template <class T>
void expint_i_113e(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113e(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 7.972e-36
@@ -1252,7 +1263,7 @@ void expint_i_113e(T& result, const T& z)
}
template <class T>
void expint_i_113f(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113f(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 4.469e-36
@@ -1299,7 +1310,7 @@ void expint_i_113f(T& result, const T& z)
}
template <class T>
void expint_i_113g(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113g(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 5.588e-35
@@ -1344,7 +1355,7 @@ void expint_i_113g(T& result, const T& z)
}
template <class T>
void expint_i_113h(T& result, const T& z)
BOOST_MATH_GPU_ENABLED void expint_i_113h(T& result, const T& z)
{
BOOST_MATH_STD_USING
// Maximum Deviation Found: 4.448e-36
@@ -1383,10 +1394,10 @@ void expint_i_113h(T& result, const T& z)
}
template <class T, class Policy>
T expint_i_imp(T z, const Policy& pol, const std::integral_constant<int, 113>& tag)
BOOST_MATH_GPU_ENABLED T expint_i_imp(T z, const Policy& pol, const boost::math::integral_constant<int, 113>& tag)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::expint<%1%>(%1%)";
constexpr auto function = "boost::math::expint<%1%>(%1%)";
if(z < 0)
return -expint_imp(1, T(-z), pol, tag);
if(z == 0)
@@ -1491,12 +1502,12 @@ struct expint_i_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init(tag());
}
static void do_init(const std::integral_constant<int, 0>&){}
static void do_init(const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 0>&){}
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 53>&)
{
boost::math::expint(T(5), Policy());
boost::math::expint(T(7), Policy());
@@ -1504,7 +1515,7 @@ struct expint_i_initializer
boost::math::expint(T(38), Policy());
boost::math::expint(T(45), Policy());
}
static void do_init(const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
{
boost::math::expint(T(5), Policy());
boost::math::expint(T(7), Policy());
@@ -1512,7 +1523,7 @@ struct expint_i_initializer
boost::math::expint(T(38), Policy());
boost::math::expint(T(45), Policy());
}
static void do_init(const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
{
boost::math::expint(T(5), Policy());
boost::math::expint(T(7), Policy());
@@ -1524,12 +1535,14 @@ struct expint_i_initializer
boost::math::expint(T(200), Policy());
boost::math::expint(T(220), Policy());
}
void force_instantiate()const{}
BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
@@ -1541,33 +1554,35 @@ struct expint_1_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init(tag());
}
static void do_init(const std::integral_constant<int, 0>&){}
static void do_init(const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 0>&){}
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 53>&)
{
boost::math::expint(1, T(0.5), Policy());
boost::math::expint(1, T(2), Policy());
}
static void do_init(const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
{
boost::math::expint(1, T(0.5), Policy());
boost::math::expint(1, T(2), Policy());
}
static void do_init(const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
{
boost::math::expint(1, T(0.5), Policy());
boost::math::expint(1, T(2), Policy());
boost::math::expint(1, T(6), Policy());
}
void force_instantiate()const{}
BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
@@ -1575,8 +1590,8 @@ template <class T, class Policy, class tag>
const typename expint_1_initializer<T, Policy, tag>::init expint_1_initializer<T, Policy, tag>::initializer;
template <class T, class Policy>
inline typename tools::promote_args<T>::type
expint_forwarder(T z, const Policy& /*pol*/, std::true_type const&)
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
expint_forwarder(T z, const Policy& /*pol*/, boost::math::true_type const&)
{
typedef typename tools::promote_args<T>::type result_type;
typedef typename policies::evaluation<result_type, Policy>::type value_type;
@@ -1587,7 +1602,7 @@ inline typename tools::promote_args<T>::type
policies::promote_double<false>,
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
typedef std::integral_constant<int,
typedef boost::math::integral_constant<int,
precision_type::value <= 0 ? 0 :
precision_type::value <= 53 ? 53 :
precision_type::value <= 64 ? 64 :
@@ -1603,8 +1618,8 @@ inline typename tools::promote_args<T>::type
}
template <class T>
inline typename tools::promote_args<T>::type
expint_forwarder(unsigned n, T z, const std::false_type&)
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
expint_forwarder(unsigned n, T z, const boost::math::false_type&)
{
return boost::math::expint(n, z, policies::policy<>());
}
@@ -1612,7 +1627,7 @@ expint_forwarder(unsigned n, T z, const std::false_type&)
} // namespace detail
template <class T, class Policy>
inline typename tools::promote_args<T>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
expint(unsigned n, T z, const Policy& /*pol*/)
{
typedef typename tools::promote_args<T>::type result_type;
@@ -1624,7 +1639,7 @@ inline typename tools::promote_args<T>::type
policies::promote_double<false>,
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
typedef std::integral_constant<int,
typedef boost::math::integral_constant<int,
precision_type::value <= 0 ? 0 :
precision_type::value <= 53 ? 53 :
precision_type::value <= 64 ? 64 :
@@ -1641,7 +1656,7 @@ inline typename tools::promote_args<T>::type
}
template <class T, class U>
inline typename detail::expint_result<T, U>::type
BOOST_MATH_GPU_ENABLED inline typename detail::expint_result<T, U>::type
expint(T const z, U const u)
{
typedef typename policies::is_policy<U>::type tag_type;
@@ -1649,7 +1664,7 @@ inline typename detail::expint_result<T, U>::type
}
template <class T>
inline typename tools::promote_args<T>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
expint(T z)
{
return expint(z, policies::policy<>());
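The forwarders above all funnel into the tag-dispatched implementations; a caller-side sketch of the policy overload (the policy name and choice are illustrative, not from the diff):

    #include <boost/math/special_functions/expint.hpp>
    #include <boost/math/policies/policy.hpp>

    // Keep evaluation in double rather than promoting, mirroring the
    // promote_double<false> used in the forwarding policies above.
    typedef boost::math::policies::policy<
        boost::math::policies::promote_double<false> > my_policy;

    double e1(double z)
    {
        return boost::math::expint(1U, z, my_policy());
    }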

View File

@@ -1,4 +1,5 @@
// (C) Copyright Nick Thompson 2019.
// (C) Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -6,21 +7,25 @@
#ifndef BOOST_MATH_SPECIAL_GEGENBAUER_HPP
#define BOOST_MATH_SPECIAL_GEGENBAUER_HPP
#include <limits>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#ifndef BOOST_MATH_NO_EXCEPTIONS
#include <stdexcept>
#include <type_traits>
#endif
namespace boost { namespace math {
template<typename Real>
Real gegenbauer(unsigned n, Real lambda, Real x)
BOOST_MATH_GPU_ENABLED Real gegenbauer(unsigned n, Real lambda, Real x)
{
static_assert(!std::is_integral<Real>::value, "Gegenbauer polynomials require floating point arguments.");
static_assert(!boost::math::is_integral<Real>::value, "Gegenbauer polynomials require floating point arguments.");
if (lambda <= -1/Real(2)) {
#ifndef BOOST_MATH_NO_EXCEPTIONS
throw std::domain_error("lambda > -1/2 is required.");
#else
return std::numeric_limits<Real>::quiet_NaN();
return boost::math::numeric_limits<Real>::quiet_NaN();
#endif
}
// The only reason to do this is because of some instability that could be present for x < 0 that is not present for x > 0.
@@ -41,7 +46,7 @@ Real gegenbauer(unsigned n, Real lambda, Real x)
Real yk = y1;
Real k = 2;
Real k_max = n*(1+std::numeric_limits<Real>::epsilon());
Real k_max = n*(1+boost::math::numeric_limits<Real>::epsilon());
Real gamma = 2*(lambda - 1);
while(k < k_max)
{
@@ -55,7 +60,7 @@ Real gegenbauer(unsigned n, Real lambda, Real x)
template<typename Real>
Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k)
BOOST_MATH_GPU_ENABLED Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k)
{
if (k > n) {
return Real(0);
@@ -70,7 +75,7 @@ Real gegenbauer_derivative(unsigned n, Real lambda, Real x, unsigned k)
}
template<typename Real>
Real gegenbauer_prime(unsigned n, Real lambda, Real x) {
BOOST_MATH_GPU_ENABLED Real gegenbauer_prime(unsigned n, Real lambda, Real x) {
return gegenbauer_derivative<Real>(n, lambda, x, 1);
}
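For reference, gegenbauer_derivative leans on the standard identity d/dx C_n^lambda(x) = 2 lambda C_{n-1}^{lambda+1}(x); a small host-side check of gegenbauer_prime against it (arguments are arbitrary):

    #include <boost/math/special_functions/gegenbauer.hpp>
    #include <cmath>
    #include <iostream>

    int main()
    {
        double lambda = 0.5, x = 0.25;
        double lhs = boost::math::gegenbauer_prime(4, lambda, x);
        double rhs = 2 * lambda * boost::math::gegenbauer(3, lambda + 1, x);
        std::cout << std::abs(lhs - rhs) << '\n'; // ~0 up to rounding
    }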

View File

@@ -1,4 +1,5 @@
// Copyright John Maddock 2012.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt
@@ -7,26 +8,31 @@
#ifndef BOOST_MATH_HANKEL_HPP
#define BOOST_MATH_HANKEL_HPP
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/detail/iconv.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/policies/error_handling.hpp>
namespace boost{ namespace math{
namespace detail{
template <class T, class Policy>
std::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign)
BOOST_MATH_GPU_ENABLED boost::math::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol, int sign)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::cyl_hankel_1<%1%>(%1%,%1%)";
if(x < 0)
{
bool isint_v = floor(v) == v;
T j, y;
bessel_jy(v, -x, &j, &y, need_j | need_y, pol);
std::complex<T> cx(x), cv(v);
std::complex<T> j_result, y_result;
boost::math::complex<T> cx(x), cv(v);
boost::math::complex<T> j_result, y_result;
if(isint_v)
{
int s = (iround(v) & 1) ? -1 : 1;
@@ -37,12 +43,12 @@ std::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol
{
j_result = pow(cx, v) * pow(-cx, -v) * j;
T p1 = pow(-x, v);
std::complex<T> p2 = pow(cx, v);
boost::math::complex<T> p2 = pow(cx, v);
y_result = p1 * y / p2
+ (p2 / p1 - p1 / p2) * j / tan(constants::pi<T>() * v);
}
// multiply y_result by i:
y_result = std::complex<T>(-sign * y_result.imag(), sign * y_result.real());
y_result = boost::math::complex<T>(-sign * y_result.imag(), sign * y_result.real());
return j_result + y_result;
}
@@ -51,25 +57,25 @@ std::complex<T> hankel_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol
if(v == 0)
{
// J is 1, Y is -INF
return std::complex<T>(1, sign * -policies::raise_overflow_error<T>(function, nullptr, pol));
return boost::math::complex<T>(1, sign * -policies::raise_overflow_error<T>(function, nullptr, pol));
}
else
{
// At least one of J and Y is complex infinity:
return std::complex<T>(policies::raise_overflow_error<T>(function, nullptr, pol), sign * policies::raise_overflow_error<T>(function, nullptr, pol));
return boost::math::complex<T>(policies::raise_overflow_error<T>(function, nullptr, pol), sign * policies::raise_overflow_error<T>(function, nullptr, pol));
}
}
T j, y;
bessel_jy(v, x, &j, &y, need_j | need_y, pol);
return std::complex<T>(j, sign * y);
return boost::math::complex<T>(j, sign * y);
}
template <class T, class Policy>
std::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign);
BOOST_MATH_GPU_ENABLED boost::math::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign);
template <class T, class Policy>
inline std::complex<T> hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<T> hankel_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol, int sign)
{
BOOST_MATH_STD_USING // ADL of std names.
int ival = detail::iconv(v, pol);
@@ -81,57 +87,57 @@ inline std::complex<T> hankel_imp(T v, T x, const bessel_maybe_int_tag&, const P
}
template <class T, class Policy>
inline std::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<T> hankel_imp(int v, T x, const bessel_int_tag&, const Policy& pol, int sign)
{
BOOST_MATH_STD_USING
if((std::abs(v) < 200) && (x > 0))
return std::complex<T>(bessel_jn(v, x, pol), sign * bessel_yn(v, x, pol));
if((abs(v) < 200) && (x > 0))
return boost::math::complex<T>(bessel_jn(v, x, pol), sign * bessel_yn(v, x, pol));
return hankel_imp(static_cast<T>(v), x, bessel_no_int_tag(), pol, sign);
}
template <class T, class Policy>
inline std::complex<T> sph_hankel_imp(T v, T x, const Policy& pol, int sign)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<T> sph_hankel_imp(T v, T x, const Policy& pol, int sign)
{
BOOST_MATH_STD_USING
return constants::root_half_pi<T>() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(std::complex<T>(x));
return constants::root_half_pi<T>() * hankel_imp(v + 0.5f, x, bessel_no_int_tag(), pol, sign) / sqrt(boost::math::complex<T>(x));
}
} // namespace detail
template <class T1, class T2, class Policy>
inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
typedef typename detail::bessel_traits<T1, T2, Policy>::optimisation_tag tag_type;
typedef typename policies::evaluation<result_type, Policy>::type value_type;
return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::hankel_imp<value_type>(v, static_cast<value_type>(x), tag_type(), pol, 1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::hankel_imp<value_type>(v, static_cast<value_type>(x), tag_type(), pol, 1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
}
template <class T1, class T2>
inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x)
{
return cyl_hankel_1(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
typedef typename detail::bessel_traits<T1, T2, Policy>::optimisation_tag tag_type;
typedef typename policies::evaluation<result_type, Policy>::type value_type;
return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::hankel_imp<value_type>(v, static_cast<value_type>(x), tag_type(), pol, -1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::hankel_imp<value_type>(v, static_cast<value_type>(x), tag_type(), pol, -1), "boost::math::cyl_hankel_1<%1%>(%1%,%1%)");
}
template <class T1, class T2>
inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x)
{
return cyl_hankel_2(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy&)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy&)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -143,17 +149,17 @@ inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type>
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::sph_hankel_imp<value_type>(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), 1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::sph_hankel_imp<value_type>(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), 1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
}
template <class T1, class T2>
inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x)
{
return sph_hankel_1(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy&)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy&)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -165,11 +171,11 @@ inline std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type>
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
return policies::checked_narrowing_cast<std::complex<result_type>, Policy>(detail::sph_hankel_imp<value_type>(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), -1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
return policies::checked_narrowing_cast<boost::math::complex<result_type>, Policy>(detail::sph_hankel_imp<value_type>(static_cast<value_type>(v), static_cast<value_type>(x), forwarding_policy(), -1), "boost::math::sph_hankel_1<%1%>(%1%,%1%)");
}
template <class T1, class T2>
inline std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x)
{
return sph_hankel_2(v, x, policies::policy<>());
}

View File

@@ -1,5 +1,6 @@
// (C) Copyright John Maddock 2006.
// (C) Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -11,8 +12,9 @@
#pragma once
#endif
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/promotion.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/policies/error_handling.hpp>
namespace boost{
@@ -20,7 +22,7 @@ namespace math{
// Recurrence relation for Hermite polynomials:
template <class T1, class T2, class T3>
inline typename tools::promote_args<T1, T2, T3>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2, T3>::type
hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1)
{
using promoted_type = tools::promote_args_t<T1, T2, T3>;
@@ -31,7 +33,7 @@ namespace detail{
// Implement Hermite polynomials via recurrence:
template <class T>
T hermite_imp(unsigned n, T x)
BOOST_MATH_GPU_ENABLED T hermite_imp(unsigned n, T x)
{
T p0 = 1;
T p1 = 2 * x;
@@ -43,7 +45,7 @@ T hermite_imp(unsigned n, T x)
while(c < n)
{
std::swap(p0, p1);
BOOST_MATH_GPU_SAFE_SWAP(p0, p1);
p1 = static_cast<T>(hermite_next(c, x, p0, p1));
++c;
}
@@ -53,7 +55,7 @@ T hermite_imp(unsigned n, T x)
} // namespace detail
template <class T, class Policy>
inline typename tools::promote_args<T>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
hermite(unsigned n, T x, const Policy&)
{
typedef typename tools::promote_args<T>::type result_type;
@@ -62,7 +64,7 @@ inline typename tools::promote_args<T>::type
}
template <class T>
inline typename tools::promote_args<T>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type
hermite(unsigned n, T x)
{
return boost::math::hermite(n, x, policies::policy<>());

View File

@@ -27,6 +27,7 @@
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/promotion.hpp> // for argument promotion.
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/complex.hpp>
#include <boost/math/policies/policy.hpp>
#ifdef BOOST_MATH_HAS_NVRTC
@@ -50,7 +51,53 @@ namespace detail{
>::type;
};
} // namespace detail
template <class T, class U>
struct expint_result
{
using type = typename boost::math::conditional<
policies::is_policy<U>::value,
tools::promote_args_t<T>,
typename tools::promote_args<U>::type
>::type;
};
typedef boost::math::integral_constant<int, 0> bessel_no_int_tag; // No integer optimisation possible.
typedef boost::math::integral_constant<int, 1> bessel_maybe_int_tag; // Maybe integer optimisation.
typedef boost::math::integral_constant<int, 2> bessel_int_tag; // Definite integer optimisation.
template <class T1, class T2, class Policy>
struct bessel_traits
{
using result_type = typename boost::math::conditional<
boost::math::is_integral<T1>::value,
typename tools::promote_args<T2>::type,
tools::promote_args_t<T1, T2>
>::type;
typedef typename policies::precision<result_type, Policy>::type precision_type;
using optimisation_tag = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 64),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
using optimisation_tag128 = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 113),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
};
} // namespace detail
} // namespace math
} // namespace boost
@@ -284,15 +331,15 @@ namespace boost
laguerre(unsigned n, T1 m, T2 x);
template <class T>
tools::promote_args_t<T>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T>
hermite(unsigned n, T x);
template <class T, class Policy>
tools::promote_args_t<T>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T>
hermite(unsigned n, T x, const Policy& pol);
template <class T1, class T2, class T3>
tools::promote_args_t<T1, T2, T3>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2, T3>
hermite_next(unsigned n, T1 x, T2 Hn, T3 Hnm1);
template<class T1, class T2, class T3>
@@ -808,28 +855,28 @@ namespace boost
const Policy&);
template <class T1, class T2>
std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_1(T1 v, T2 x);
template <class T1, class T2, class Policy>
std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_1(T1 v, T2 x, const Policy& pol);
template <class T1, class T2, class Policy>
std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> cyl_hankel_2(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> cyl_hankel_2(T1 v, T2 x);
template <class T1, class T2, class Policy>
std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_1(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_1(T1 v, T2 x);
template <class T1, class T2, class Policy>
std::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, Policy>::result_type> sph_hankel_2(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
std::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED boost::math::complex<typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type> sph_hankel_2(T1 v, T2 x);
template <class T, class Policy>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> airy_ai(T x, const Policy&);
@@ -936,7 +983,7 @@ namespace boost
template <class T, class U>
struct expint_result
{
typedef typename std::conditional<
typedef typename boost::math::conditional<
policies::is_policy<U>::value,
tools::promote_args_t<T>,
typename tools::promote_args<U>::type
@@ -946,13 +993,13 @@ namespace boost
} // namespace detail
template <class T, class Policy>
tools::promote_args_t<T> expint(unsigned n, T z, const Policy&);
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> expint(unsigned n, T z, const Policy&);
template <class T, class U>
typename detail::expint_result<T, U>::type expint(T const z, U const u);
BOOST_MATH_GPU_ENABLED typename detail::expint_result<T, U>::type expint(T const z, U const u);
template <class T>
tools::promote_args_t<T> expint(T z);
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> expint(T z);
// Zeta:
template <class T, class Policy>
@@ -1346,7 +1393,7 @@ namespace boost
laguerre(unsigned n, T1 m, T2 x) { return ::boost::math::laguerre(n, m, x, Policy()); }\
\
template <class T>\
inline boost::math::tools::promote_args_t<T> \
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> \
hermite(unsigned n, T x){ return ::boost::math::hermite(n, x, Policy()); }\
\
using boost::math::hermite_next;\
@@ -1620,11 +1667,11 @@ template <class OutputIterator, class T>\
using boost::math::changesign;\
\
template <class T, class U>\
inline typename boost::math::tools::promote_args_t<T,U> expint(T const& z, U const& u)\
BOOST_MATH_GPU_ENABLED inline typename boost::math::tools::promote_args_t<T,U> expint(T const& z, U const& u)\
{ return boost::math::expint(z, u, Policy()); }\
\
template <class T>\
inline boost::math::tools::promote_args_t<T> expint(T z){ return boost::math::expint(z, Policy()); }\
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> expint(T z){ return boost::math::expint(z, Policy()); }\
\
template <class T>\
inline boost::math::tools::promote_args_t<T> zeta(T s){ return boost::math::zeta(s, Policy()); }\
@@ -1669,19 +1716,19 @@ template <class OutputIterator, class T>\
inline boost::math::tools::promote_args_t<RT1, RT2> owens_t(RT1 a, RT2 z){ return boost::math::owens_t(a, z, Policy()); }\
\
template <class T1, class T2>\
inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> cyl_hankel_1(T1 v, T2 x)\
inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> cyl_hankel_1(T1 v, T2 x)\
{ return boost::math::cyl_hankel_1(v, x, Policy()); }\
\
template <class T1, class T2>\
inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> cyl_hankel_2(T1 v, T2 x)\
inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> cyl_hankel_2(T1 v, T2 x)\
{ return boost::math::cyl_hankel_2(v, x, Policy()); }\
\
template <class T1, class T2>\
inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> sph_hankel_1(T1 v, T2 x)\
inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> sph_hankel_1(T1 v, T2 x)\
{ return boost::math::sph_hankel_1(v, x, Policy()); }\
\
template <class T1, class T2>\
inline std::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> sph_hankel_2(T1 v, T2 x)\
inline BOOST_MATH_GPU_ENABLED boost::math::complex<typename boost::math::detail::bessel_traits<T1, T2, Policy >::result_type> sph_hankel_2(T1 v, T2 x)\
{ return boost::math::sph_hankel_2(v, x, Policy()); }\
\
template <class T>\

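A note on the expint_result dispatch shown above: the second argument of the two-argument expint overload may be either a numeric order or a policy, and the metafunction selects the return type accordingly. A minimal host-side sketch of the three call forms (illustrative only, not part of this commit):
#include <boost/math/special_functions/expint.hpp>
#include <boost/math/policies/policy.hpp>
int main()
{
    using boost::math::expint;
    double ei = expint(1.5);                                    // one argument: Ei(x)
    double en = expint(2, 1.5);                                 // order then argument: E_n(x)
    double ep = expint(1.5, boost::math::policies::policy<>()); // argument then policy: Ei(x) under a policy
    return (ei > 0.0 && en > 0.0 && ep > 0.0) ? 0 : 1;
}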
View File

@@ -14,11 +14,93 @@
#include <boost/math/tools/is_detected.hpp>
#ifdef BOOST_MATH_ENABLE_CUDA
#include <cuda/std/utility>
#endif
#ifndef BOOST_MATH_HAS_NVRTC
#include <cuda/std/utility>
#include <cuda/std/complex>
namespace boost {
namespace math {
template <typename T>
using complex = cuda::std::complex<T>;
using cuda::std::real;
using cuda::std::imag;
using cuda::std::abs;
using cuda::std::arg;
using cuda::std::norm;
using cuda::std::conj;
using cuda::std::proj;
using cuda::std::polar;
using cuda::std::exp;
using cuda::std::log;
using cuda::std::log10;
using cuda::std::pow;
using cuda::std::sqrt;
using cuda::std::sin;
using cuda::std::cos;
using cuda::std::tan;
using cuda::std::asin;
using cuda::std::acos;
using cuda::std::atan;
using cuda::std::sinh;
using cuda::std::cosh;
using cuda::std::tanh;
using cuda::std::asinh;
using cuda::std::acosh;
using cuda::std::atanh;
} // namespace math
} // namespace boost
#else
#include <utility>
#include <complex>
namespace boost {
namespace math {
template <typename T>
using complex = std::complex<T>;
using std::real;
using std::imag;
using std::abs;
using std::arg;
using std::norm;
using std::conj;
using std::proj;
using std::polar;
using std::exp;
using std::log;
using std::log10;
using std::pow;
using std::sqrt;
using std::sin;
using std::cos;
using std::tan;
using std::asin;
using std::acos;
using std::atan;
using std::sinh;
using std::cosh;
using std::tanh;
using std::asinh;
using std::acosh;
using std::atanh;
} // namespace math
} // namespace boost
#endif
namespace boost {

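The alias block above is what lets one generic definition serve both host and device: boost::math::complex<T> resolves to cuda::std::complex<T> when CUDA is enabled and to std::complex<T> otherwise, with the accompanying using-declarations covering the free functions. A hypothetical sketch (the unit_phasor helper is not library code), assuming BOOST_MATH_GPU_ENABLED comes from <boost/math/tools/config.hpp>:
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/complex.hpp>
template <typename T>
BOOST_MATH_GPU_ENABLED boost::math::complex<T> unit_phasor(T theta)
{
    // polar resolves to std::polar on host-only builds and to
    // cuda::std::polar under CUDA, per the aliases above.
    return boost::math::polar(T(1), theta);
}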
View File

@@ -678,6 +678,7 @@ namespace boost{ namespace math{
#include <cuda/std/cstdint>
#include <cuda/std/array>
#include <cuda/std/tuple>
#include <cuda/std/complex>
# define BOOST_MATH_CUDA_ENABLED __host__ __device__
# define BOOST_MATH_HAS_GPU_SUPPORT

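For context, a minimal sketch of what the marker macro buys (the clamp01 helper below is hypothetical, not library code): under NVCC the macro expands to __host__ __device__, so a single definition is callable from both sides, while a host-only build sees a plain function.
#include <boost/math/tools/config.hpp>
template <typename T>
BOOST_MATH_GPU_ENABLED T clamp01(T x)
{
    // __host__ __device__ under CUDA; a plain function template otherwise.
    return x < T(0) ? T(0) : (x > T(1) ? T(1) : x);
}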
View File

@@ -304,6 +304,14 @@ run test_cyl_neumann_double.cu ;
run test_cyl_neumann_float.cu ;
run test_sph_neumann_double.cu ;
run test_sph_neumann_float.cu ;
run test_cyl_hankel_1_double.cu ;
run test_cyl_hankel_1_float.cu ;
run test_cyl_hankel_2_double.cu ;
run test_cyl_hankel_2_float.cu ;
run test_sph_hankel_1_double.cu ;
run test_sph_hankel_1_float.cu ;
run test_sph_hankel_2_double.cu ;
run test_sph_hankel_2_float.cu ;
run test_cbrt_double.cu ;
run test_cbrt_float.cu ;
@@ -339,9 +347,18 @@ run test_erfc_float.cu ;
run test_erfc_inv_double.cu ;
run test_erfc_inv_float.cu ;
run test_expint_double.cu ;
run test_expint_float.cu ;
run test_expm1_double.cu ;
run test_expm1_float.cu ;
run test_gegenbauer_double.cu ;
run test_gegenbauer_float.cu ;
run test_hermite_double.cu ;
run test_hermite_float.cu ;
run test_lgamma_double.cu ;
run test_lgamma_float.cu ;
run test_tgamma_double.cu ;

View File

@@ -9,9 +9,6 @@ project : requirements
[ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
;
run test_heumann_lambda_nvrtc_double.cpp ;
run test_heumann_lambda_nvrtc_float.cpp ;
# Quad
run test_exp_sinh_quad_nvrtc_float.cpp ;
run test_exp_sinh_quad_nvrtc_double.cpp ;
@@ -306,6 +303,14 @@ run test_cyl_neumann_nvrtc_double.cpp ;
run test_cyl_neumann_nvrtc_float.cpp ;
run test_sph_neumann_nvrtc_double.cpp ;
run test_sph_neumann_nvrtc_float.cpp ;
run test_cyl_hankel_1_nvrtc_double.cpp ;
run test_cyl_hankel_1_nvrtc_float.cpp ;
run test_cyl_hankel_2_nvrtc_double.cpp ;
run test_cyl_hankel_2_nvrtc_float.cpp ;
run test_sph_hankel_1_nvrtc_double.cpp ;
run test_sph_hankel_1_nvrtc_float.cpp ;
run test_sph_hankel_2_nvrtc_double.cpp ;
run test_sph_hankel_2_nvrtc_float.cpp ;
run test_cbrt_nvrtc_double.cpp ;
run test_cbrt_nvrtc_float.cpp ;
@@ -335,6 +340,11 @@ run test_ellint_d_nvrtc_double.cpp ;
run test_ellint_d_nvrtc_float.cpp ;
run test_jacobi_zeta_nvrtc_double.cpp ;
run test_jacobi_zeta_nvrtc_float.cpp ;
run test_heumann_lambda_nvrtc_double.cpp ;
run test_heumann_lambda_nvrtc_float.cpp ;
run test_expint_nvrtc_double.cpp ;
run test_expint_nvrtc_float.cpp ;
run test_expm1_nvrtc_double.cpp ;
run test_expm1_nvrtc_float.cpp ;
@@ -351,6 +361,12 @@ run test_gamma_p_inv_nvrtc_float.cpp ;
run test_tgamma_ratio_nvrtc_double.cpp ;
run test_tgamma_ratio_nvrtc_float.cpp ;
run test_gegenbauer_nvrtc_double.cpp ;
run test_gegenbauer_nvrtc_float.cpp ;
run test_hermite_nvrtc_double.cpp ;
run test_hermite_nvrtc_float.cpp ;
run test_log1p_nvrtc_double.cpp ;
run test_log1p_nvrtc_float.cpp ;

View File

@@ -71,8 +71,14 @@ run test_sign.cpp ;
run test_round.cpp ;
run test_expint.cpp ;
run test_expm1_simple.cpp ;
run gegenbauer_test.cpp ;
run test_hankel.cpp ;
run test_log1p_simple.cpp ;
run test_digamma_simple.cpp ;
@@ -85,3 +91,5 @@ run test_gamma.cpp ;
run test_igamma.cpp ;
run test_igamma_inv.cpp ;
run test_igamma_inva.cpp ;
run test_hermite.cpp ;

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results;
results.resize(numElements); // resize (not reserve): the loop below assigns by index
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::cyl_hankel_1(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results;
results.resize(numElements); // resize (not reserve): the loop below assigns by index
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::cyl_hankel_1(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_cyl_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_hankel_1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_hankel_1(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_cyl_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_1(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_hankel_1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_hankel_1(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results;
results.resize(numElements); // resize (not reserve): the loop below assigns by index
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::cyl_hankel_2(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results;
results.resize(numElements); // resize (not reserve): the loop below assigns by index
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::cyl_hankel_2(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_cyl_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_2_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_hankel_2_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_2_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_hankel_2(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_cyl_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_hankel_2(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_hankel_2_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_hankel_2_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_hankel_2_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_hankel_2(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -3,7 +3,14 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#endif
#ifndef BOOST_MATH_OVERFLOW_ERROR_POLICY
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#endif
#include "test_expint.hpp"
//
@@ -78,7 +85,11 @@ void expected_results()
".*", // platform
"float|double|long double", // test type(s)
".*Ei.*", // test data group
#ifndef SYCL_LANGUAGE_VERSION
".*", 6, 3); // test function
#else
".*", 10, 3);
#endif
if(std::numeric_limits<long double>::digits > 100)
{
add_expected_result(

View File

@@ -4,13 +4,19 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
#include <boost/math/concepts/real_concept.hpp>
#endif
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/expint.hpp>
#define BOOST_TEST_MAIN
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/tools/stats.hpp>
#include <boost/math/tools/test.hpp>
#include "../include_private/boost/math/tools/test.hpp"
#include <boost/math/constants/constants.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>

test/test_expint_double.cu (new file, 106 lines)
View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::expint(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = dist(rng);
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::expint(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

test/test_expint_float.cu (new file, 106 lines)
View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::expint(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = dist(rng);
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::expint(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/expint.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/expint.hpp>
extern "C" __global__
void test_expint_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::expint(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_expint_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_expint_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_expint_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::expint(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/expint.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/expint.hpp>
extern "C" __global__
void test_expint_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::expint(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_expint_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_expint_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
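// Note: the include paths above are environment specific; local runs will
// generally need to point --include-path at their own Boost checkout.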
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
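// The PTX is JIT-compiled for the actual device by the driver when the module is loaded below.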
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_expint_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
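// cuLaunchKernel takes an array of pointers to each kernel argument, not the argument values themselves.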
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify results against the host implementation and track failures
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::expint(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,125 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Gegenbauer prime uses all methods internally so it's the easy choice
#include <iostream>
#include <iomanip>
#include <vector>
#include <cmath>   // std::isfinite used when checking results
#include <boost/math/special_functions.hpp>
#include <boost/math/special_functions/gegenbauer.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::gegenbauer_prime(2, in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
// Size the vector up front: operator[] on a merely reserved vector is undefined behavior
std::vector<float_type> results(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::gegenbauer_prime(2, input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]);
// Most elements agree to within 50 eps, but values near zero can differ far more
if (eps > 1000)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i]
<< "\n Host: " << results[i]
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,124 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Gegenbauer prime uses all methods internally so it's the easy choice
#include <iostream>
#include <iomanip>
#include <vector>
#include <cmath>   // std::isfinite used when checking results
#include <boost/math/special_functions.hpp>
#include <boost/math/special_functions/gegenbauer.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::gegenbauer_prime(2, in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
// Size the vector up front: operator[] on a merely reserved vector is undefined behavior
std::vector<float_type> results(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::gegenbauer_prime(2, input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
const auto eps = boost::math::epsilon_difference(output_vector[i], results[i]);
if (eps > 1000)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i]
<< "\n Host: " << results[i]
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath>   // std::isfinite used in host-side verification
#include <boost/math/special_functions/gegenbauer.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/gegenbauer.hpp>
extern "C" __global__
void test_gegenbauer_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::gegenbauer(2, in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gegenbauer_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_gegenbauer_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_gegenbauer_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify results against the host implementation and track failures
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::gegenbauer(2, h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath>   // std::isfinite used in host-side verification
#include <boost/math/special_functions/gegenbauer.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/gegenbauer.hpp>
extern "C" __global__
void test_gegenbauer_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::gegenbauer(2, in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gegenbauer_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_gegenbauer_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_gegenbauer_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify results against the host implementation and track failures
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::gegenbauer(2, h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -3,9 +3,13 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#endif
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#include <boost/math/tools/config.hpp>
#define BOOST_TEST_MAIN
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
@@ -85,6 +89,7 @@ void test_hankel(T, const char* name)
//
// Instantiate a few instances to check our error handling code can cope with std::complex:
//
#ifndef SYCL_LANGUAGE_VERSION
typedef boost::math::policies::policy<
boost::math::policies::overflow_error<boost::math::policies::throw_on_error>,
boost::math::policies::denorm_error<boost::math::policies::throw_on_error>,
@@ -120,7 +125,7 @@ typedef boost::math::policies::policy<
boost::math::policies::indeterminate_result_error<boost::math::policies::ignore_error> > pol3;
template std::complex<double> boost::math::cyl_hankel_1<double, double, pol3>(double, double, const pol3&);
#endif
BOOST_AUTO_TEST_CASE( test_main )
{

View File

@@ -5,8 +5,11 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#include"test_hermite.hpp"
#endif
#include "test_hermite.hpp"
//
// DESCRIPTION:

View File

@@ -11,11 +11,17 @@
// Constants are too big for float case, but this doesn't matter for test.
#endif
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
#include <boost/math/concepts/real_concept.hpp>
#endif
#define BOOST_TEST_MAIN
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/hermite.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/array.hpp>
#include "functor.hpp"

test/test_hermite_double.cu
View File

@@ -0,0 +1,120 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <cmath>   // std::isfinite used when checking results
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
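// Note: in1 is deliberately unused; hermite(1, x) == 2x keeps both the device
// computation and the host reference below trivial to verify.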
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::hermite(1U, in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::hermite(1U, input_vector2[i]));
double t = w.elapsed();
// check the results
int fail_counter = 0;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i] << '\n'
<< " Host: " << results[i] << '\n'
<< " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
fail_counter++;
if (fail_counter > 100)
{
break;
}
}
}
}
if (fail_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

test/test_hermite_float.cu
View File

@@ -0,0 +1,120 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <cmath>   // std::isfinite used when checking results
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::hermite(1U, in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::hermite(1U, input_vector2[i]));
double t = w.elapsed();
// check the results
int fail_counter = 0;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i] << '\n'
<< " Host: " << results[i] << '\n'
<< " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
fail_counter++;
if (fail_counter > 100)
{
break;
}
}
}
}
if (fail_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath>   // std::isfinite used in host-side verification
#include <boost/math/special_functions/hermite.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hermite.hpp>
extern "C" __global__
void test_hermite_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::hermite(1U, in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_hermite_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_hermite_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_hermite_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify results against the host implementation and track failures
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::hermite(1U, h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath>   // std::isfinite used in host-side verification
#include <boost/math/special_functions/hermite.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hermite.hpp>
extern "C" __global__
void test_hermite_kernel(const float_type *in1, const float_type* in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::hermite(1U, in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_hermite_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_hermite_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_hermite_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify results against the host implementation and track failures
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::hermite(1U, h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n  Serial: " << res
<< "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
++fail_counter;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
// Size the vector up front: operator[] on a merely reserved vector is undefined behavior
std::vector<boost::math::complex<float_type>> results(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::sph_hankel_1(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
// Size the vector up front: operator[] on a merely reserved vector is undefined behavior
std::vector<boost::math::complex<float_type>> results(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::sph_hankel_1(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_sph_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_sph_hankel_1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
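// Assumption: boost::math::complex<T> is std::complex<T> on the host and a
// layout-compatible complex type in device code, so the raw cudaMemcpy below is safe.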
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_hankel_1(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (fail_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_sph_hankel_1_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_1(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_sph_hankel_1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
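// Ceiling division: with numElements = 5000 this is (5000 + 255) / 256 = 20 blocks
// (5120 threads); the i < numElements guard in the kernel masks the 120 excess threads.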
int numBlocks = (numElements + blockSize - 1) / blockSize;
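// cuLaunchKernel takes kernel parameters as an array of pointers, one per argument.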
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
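// Only the real component is checked; 300 epsilon-differences is a loose bound
// that allows for device-vs-host rounding differences in float.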
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_hankel_1(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuModuleUnload(module);
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
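
The sph_hankel_1 and sph_hankel_2 NVRTC drivers in this commit are identical except for the function under test and the choice of float_type. A sketch of how the embedded kernel source could be stamped out from one template (an assumed refactoring, not something the commit does):

    #include <string>

    // Hypothetical generator for the embedded kernel source: one driver could
    // then cover sph_hankel_1 and sph_hankel_2 at both float and double.
    std::string make_kernel_source(const std::string& func, const std::string& fp_type)
    {
        return "typedef " + fp_type + " float_type;\n"
               "#include <cuda/std/type_traits>\n"
               "#include <boost/math/special_functions/hankel.hpp>\n"
               "extern \"C\" __global__\n"
               "void test_kernel(const float_type *in1, const float_type* in2, "
               "boost::math::complex<float_type> *out, int numElements)\n"
               "{\n"
               "    int i = blockDim.x * blockIdx.x + threadIdx.x;\n"
               "    if (i < numElements)\n"
               "    {\n"
               "        out[i] = boost::math::" + func + "(in1[i], in2[i]);\n"
               "    }\n"
               "}\n";
    }

The returned string would be passed to nvrtcCreateProgram in place of the cuda_kernel literal, with "test_kernel" then used for cuModuleGetFunction.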

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results(numElements); // sized, not merely reserved: operator[] past size() is undefined behavior
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::sph_hankel_2(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
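
cuda_managed_ptr comes from a test-support header ("cuda_managed_ptr.hpp") that this diff does not show. Its usage above, construction from an element count, host-side operator[], and .get() handed straight to a kernel, is consistent with a thin RAII wrapper over CUDA unified memory. A minimal sketch under that assumption (the real header may also construct elements; this sketch assumes raw storage suffices for the arithmetic and complex types used here):

    #include <cstddef>
    #include <cuda_runtime.h>

    // Hypothetical reconstruction of the cuda_managed_ptr helper: unified
    // memory is addressable from both host and device, which is why the tests
    // can fill the buffers on the host and read results back without memcpy.
    template <class T>
    class cuda_managed_ptr
    {
        T* ptr_ = nullptr;
    public:
        explicit cuda_managed_ptr(std::size_t n)
        {
            cudaMallocManaged(&ptr_, n * sizeof(T));
        }
        ~cuda_managed_ptr() { cudaFree(ptr_); }
        cuda_managed_ptr(const cuda_managed_ptr&) = delete;
        cuda_managed_ptr& operator=(const cuda_managed_ptr&) = delete;
        T& operator[](std::size_t i) { return ptr_[i]; }
        const T& operator[](std::size_t i) const { return ptr_[i]; }
        T* get() const { return ptr_; }
    };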

View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<boost::math::complex<float_type>> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<boost::math::complex<float_type>> results(numElements); // sized, not merely reserved: operator[] past size() is undefined behavior
w.reset();
for(int i = 0; i < numElements; ++i)
results[i] = boost::math::sph_hankel_2(input_vector1[i], input_vector2[i]);
double t = w.elapsed();
// check the results
int failure_counter = 0;
for(int i = 0; i < numElements; ++i)
{
const auto eps = boost::math::epsilon_difference(output_vector[i].real(), results[i].real());
if (eps > 10)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i].real() << ", " << output_vector[i].imag()
<< "\n Host: " << results[i].real() << ", " << results[i].imag()
<< "\n Eps: " << eps << std::endl;
++failure_counter;
if (failure_counter > 100)
{
break;
}
}
}
if (failure_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
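
The watch timer is the other unshown helper (from "stopwatch.hpp"). The calls made here, elapsed() returning seconds as a double and reset(), suggest a small std::chrono wrapper along these lines (a sketch, not the actual header):

    #include <chrono>

    // Hypothetical reconstruction of the "watch" helper: starts timing on
    // construction and reports elapsed wall-clock time in seconds.
    class watch
    {
        std::chrono::steady_clock::time_point start_ = std::chrono::steady_clock::now();
    public:
        double elapsed() const
        {
            return std::chrono::duration<double>(std::chrono::steady_clock::now() - start_).count();
        }
        void reset() { start_ = std::chrono::steady_clock::now(); }
    };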

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_sph_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_2_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_sph_hankel_2_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_2_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_hankel_2(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuModuleUnload(module);
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
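
The tolerance of 300 used by the NVRTC drivers is measured in units of machine epsilon: boost::math::epsilon_difference(a, b) reports how many epsilons separate two values. A tiny host-only illustration, independent of the tests above:

    #include <iostream>
    #include <limits>
    #include <boost/math/special_functions/relative_difference.hpp>

    int main()
    {
        const float a = 1.0f;
        const float b = a + 4 * std::numeric_limits<float>::epsilon();
        // Prints approximately 4: the values sit four float epsilons apart,
        // well inside the 300-epsilon bound the kernels are held to.
        std::cout << boost::math::epsilon_difference(a, b) << std::endl;
    }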

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/tools/complex.hpp>
#include <boost/math/special_functions/hankel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/hankel.hpp>
extern "C" __global__
void test_sph_hankel_2_kernel(const float_type *in1, const float_type* in2, boost::math::complex<float_type> *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_hankel_2(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_sph_hankel_2_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_sph_hankel_2_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_sph_hankel_2_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2;
float_type *d_in1, *d_in2;
boost::math::complex<float_type> *h_out, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new boost::math::complex<float_type>[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(boost::math::complex<float_type>)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(boost::math::complex<float_type>), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
int fail_counter = 0;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_hankel_2(h_in1[i], h_in2[i]);
if (boost::math::epsilon_difference(res.real(), h_out[i].real()) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i].real() << ", " << h_out[i].imag()
<< "\n Serial: " << res.real() << ", " << res.imag()
<< "\n Dist: " << boost::math::epsilon_difference(res.real(), h_out[i].real()) << std::endl;
++fail_counter;
if (fail_counter > 100)
{
break;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuModuleUnload(module);
cuCtxDestroy(context);
if (fail_counter > 0)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
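
One build note on the drivers above: unlike the runtime-API tests, which need only libcudart, the NVRTC drivers touch all three host-side CUDA libraries and must link against libnvrtc (runtime compilation), libcuda (the driver API used for module loading and cuLaunchKernel), and libcudart (cudaMalloc/cudaMemcpy).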