Mirror of https://github.com/boostorg/math.git

Add bessel GPU support

Add GPU support to bessel_i0

Add CUDA and NVRTC testing

Add GPU support to bessel_i1

Add CUDA and NVRTC testing of bessel_i1

Add tgamma1pm1 NVRTC impl

Add GPU support to iconv

Add GPU test to bessel_ik

Add SYCL testing of complete bessel_i

Add GPU support to bessel_j0

Ignore BOOST_MATH_INSTRUMENT_VARIABLE on NVRTC

Add bessel J0 CUDA and NVRTC testing

Add GPU support to bessel_j1

Add bessel j1 CUDA and NVRTC testing

Add GPU support to bessel jn and jy

Add SYCL bessel j testing

Add bessel_k0 GPU support

Add bessel_k0 CUDA and NVRTC testing

Add GPU support to bessel_k1

Add bessel_k1 CUDA and NVRTC testing

Add GPU support to bessel_kn

Add bessel_kn CUDA and NVRTC testing

Add SYCL testing of complete bessel_k

Make Newton-Raphson GPU compatible

Make the complete bessel functions GPU compatible

Add SYCL bessel y testing

Apply changes for non-empty policy on CUDA

Add NVCC cyl_bessel_i testing

Add GPU support to sinc

Add GPU support to series functions

Add GPU support to bessel_jy_zero

Add array helper type

Make hypot GPU safe

Make bessel_yX GPU capable

Add bessel_y0 and bessel_y1 CUDA testing

Add nvrtc testing of bessel_y0 and bessel_y1

Fix macros

Add missing header

Add missing header

Markup iconv

Add iround for NVRTC

Add tgamma1pm1 with policy overload for NVRTC

Disable header

Fix factorial support for CUDA platforms

Add definition of bessel traits

Add cyl_bessel_i NVRTC testing

Fix cyl_bessel_jy warnings

Fix CUDA forward declarations

Fix maybe-unused variable warning

Add CUDA cyl_bessel_j testing

Add sign overload for lgamma

Fix warnings

Add NVRTC cyl_bessel_j testing

Add NVCC sph_bessel testing

Add NVRTC testing of sph_bessel

Add NVRTC testing of cyl_bessel_k

Add NVCC testing of cyl_bessel_k

Add NVCC testing of cyl_neumann

Add NVRTC cyl_neumann testing

Add NVRTC sph_neumann testing

Add NVCC sph_neumann testing
Author: Matt Borland
Date:   2024-08-20 16:40:47 -04:00
Parent: 1d40454024
Commit: 047c206c30
104 changed files with 10079 additions and 675 deletions
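The commit's central change is marking functions BOOST_MATH_GPU_ENABLED so they compile for device code. As orientation before the diffs, a minimal sketch (not part of the commit) of the kind of call site this enables under NVCC; the kernel name and indexing are illustrative only:

#include <boost/math/special_functions/bessel.hpp>

// Illustrative kernel: BOOST_MATH_GPU_ENABLED expands to __host__ __device__
// on CUDA builds, so the Bessel front-ends can be called on the device.
__global__ void cyl_bessel_j_kernel(const double* in, double* out, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        out[i] = boost::math::cyl_bessel_j(2.0, in[i]);
    }
}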

View File: include/boost/math/special_functions/bessel.hpp

@@ -15,8 +15,14 @@
# pragma once
#endif
#include <limits>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/promotion.hpp>
#include <boost/math/tools/series.hpp>
#include <boost/math/tools/roots.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/special_functions/detail/bessel_jy.hpp>
#include <boost/math/special_functions/detail/bessel_jn.hpp>
#include <boost/math/special_functions/detail/bessel_yn.hpp>
@@ -31,10 +37,8 @@
#include <boost/math/special_functions/sinc.hpp>
#include <boost/math/special_functions/trunc.hpp>
#include <boost/math/special_functions/round.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/promotion.hpp>
#include <boost/math/tools/series.hpp>
#include <boost/math/tools/roots.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#ifdef _MSC_VER
# pragma warning(push)
@@ -43,6 +47,50 @@
namespace boost{ namespace math{
// Since we cannot pull this in from math_fwd, we need a copy
#ifdef BOOST_MATH_HAS_NVRTC
namespace detail{
typedef boost::math::integral_constant<int, 0> bessel_no_int_tag; // No integer optimisation possible.
typedef boost::math::integral_constant<int, 1> bessel_maybe_int_tag; // Maybe integer optimisation.
typedef boost::math::integral_constant<int, 2> bessel_int_tag; // Definite integer optimisation.
template <class T1, class T2, class Policy>
struct bessel_traits
{
using result_type = typename boost::math::conditional<
boost::math::is_integral<T1>::value,
typename tools::promote_args<T2>::type,
tools::promote_args_t<T1, T2>
>::type;
typedef typename policies::precision<result_type, Policy>::type precision_type;
using optimisation_tag = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 64),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
using optimisation_tag128 = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 113),
bessel_no_int_tag,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
};
} // detail
#endif
namespace detail{
template <class T, class Policy>
@@ -50,7 +98,7 @@ struct sph_bessel_j_small_z_series_term
{
typedef T result_type;
sph_bessel_j_small_z_series_term(unsigned v_, T x)
BOOST_MATH_GPU_ENABLED sph_bessel_j_small_z_series_term(unsigned v_, T x)
: N(0), v(v_)
{
BOOST_MATH_STD_USING
@@ -64,7 +112,7 @@ struct sph_bessel_j_small_z_series_term
term = pow(mult, T(v)) / boost::math::tgamma(v+1+T(0.5f), Policy());
mult *= -mult;
}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
T r = term;
++N;
@@ -79,11 +127,11 @@ private:
};
template <class T, class Policy>
inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
sph_bessel_j_small_z_series_term<T, Policy> s(v, x);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon<T, Policy>(), max_iter);
@@ -92,10 +140,21 @@ inline T sph_bessel_j_small_z_series(unsigned v, T x, const Policy& pol)
}
template <class T, class Policy>
T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp_final(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::bessel_j<%1%>(%1%,%1%)";
T result_J, y; // LCOV_EXCL_LINE
bessel_jy(v, x, &result_J, &y, need_j, pol);
return result_J;
}
// Dispatch function to avoid recursion
template <class T, class Policy>
BOOST_MATH_GPU_ENABLED T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
{
BOOST_MATH_STD_USING
if(x < 0)
{
// better have integer v:
@@ -105,23 +164,27 @@ T cyl_bessel_j_imp(T v, T x, const bessel_no_int_tag& t, const Policy& pol)
// This branch is hit by multiprecision types only, and is
// tested by our real_concept tests, but these are excluded from coverage
// due to time constraints.
T r = cyl_bessel_j_imp(v, T(-x), t, pol);
T r = cyl_bessel_j_imp_final(T(v), T(-x), t, pol);
if (iround(v, pol) & 1)
{
r = -r;
}
return r;
// LCOV_EXCL_STOP
}
else
{
constexpr auto function = "boost::math::bessel_j<%1%>(%1%,%1%)";
return policies::raise_domain_error<T>(function, "Got x = %1%, but we need x >= 0", x, pol);
}
}
T result_J, y; // LCOV_EXCL_LINE
bessel_jy(v, x, &result_J, &y, need_j, pol);
return result_J;
return cyl_bessel_j_imp_final(T(v), T(x), t, pol);
}
template <class T, class Policy>
inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names.
int ival = detail::iconv(v, pol);
@@ -135,14 +198,14 @@ inline T cyl_bessel_j_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& p
}
template <class T, class Policy>
inline T cyl_bessel_j_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
{
BOOST_MATH_STD_USING
return bessel_jn(v, x, pol);
}
template <class T, class Policy>
inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
if(x < 0)
@@ -171,7 +234,7 @@ inline T sph_bessel_j_imp(unsigned n, T x, const Policy& pol)
}
template <class T, class Policy>
T cyl_bessel_i_imp(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED T cyl_bessel_i_imp_final(T v, T x, const Policy& pol)
{
//
// This handles all the bessel I functions, note that we don't optimise
@@ -180,20 +243,7 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol)
// case has better error handling too).
//
BOOST_MATH_STD_USING
static const char* function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)";
if(x < 0)
{
// better have integer v:
if(floor(v) == v)
{
T r = cyl_bessel_i_imp(v, T(-x), pol);
if(iround(v, pol) & 1)
r = -r;
return r;
}
else
return policies::raise_domain_error<T>(function, "Got x = %1%, but we need x >= 0", x, pol);
}
constexpr auto function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)";
if(x == 0)
{
if(v < 0)
@@ -210,7 +260,7 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol)
}
return sqrt(2 / (x * constants::pi<T>())) * sinh(x);
}
if((policies::digits<T, Policy>() <= 113) && (std::numeric_limits<T>::digits <= 113) && (std::numeric_limits<T>::radix == 2))
if((policies::digits<T, Policy>() <= 113) && (boost::math::numeric_limits<T>::digits <= 113) && (boost::math::numeric_limits<T>::radix == 2))
{
if(v == 0)
{
@@ -228,10 +278,39 @@ T cyl_bessel_i_imp(T v, T x, const Policy& pol)
return result_I;
}
// Additional dispatch function to keep the GPU impls happy
template <class T, class Policy>
inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Policy& pol)
BOOST_MATH_GPU_ENABLED T cyl_bessel_i_imp(T v, T x, const Policy& pol)
{
static const char* function = "boost::math::cyl_bessel_k<%1%>(%1%,%1%)";
BOOST_MATH_STD_USING
constexpr auto function = "boost::math::cyl_bessel_i<%1%>(%1%,%1%)";
if(x < 0)
{
// better have integer v:
if(floor(v) == v)
{
T r = cyl_bessel_i_imp_final(T(v), T(-x), pol);
if(iround(v, pol) & 1)
{
r = -r;
}
return r;
}
else
{
return policies::raise_domain_error<T>(function, "Got x = %1%, but we need x >= 0", x, pol);
}
}
return cyl_bessel_i_imp_final(T(v), T(x), pol);
}
template <class T, class Policy>
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Policy& pol)
{
constexpr auto function = "boost::math::cyl_bessel_k<%1%>(%1%,%1%)";
BOOST_MATH_STD_USING
if(x < 0)
{
@@ -248,7 +327,7 @@ inline T cyl_bessel_k_imp(T v, T x, const bessel_no_int_tag& /* t */, const Poli
}
template <class T, class Policy>
inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
{
BOOST_MATH_STD_USING
if((floor(v) == v))
@@ -259,15 +338,15 @@ inline T cyl_bessel_k_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& p
}
template <class T, class Policy>
inline T cyl_bessel_k_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_k_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
{
return bessel_kn(v, x, pol);
}
template <class T, class Policy>
inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol)
{
static const char* function = "boost::math::cyl_neumann<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::cyl_neumann<%1%>(%1%,%1%)";
BOOST_MATH_INSTRUMENT_VARIABLE(v);
BOOST_MATH_INSTRUMENT_VARIABLE(x);
@@ -291,7 +370,7 @@ inline T cyl_neumann_imp(T v, T x, const bessel_no_int_tag&, const Policy& pol)
}
template <class T, class Policy>
inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& pol)
{
BOOST_MATH_STD_USING
@@ -310,16 +389,16 @@ inline T cyl_neumann_imp(T v, T x, const bessel_maybe_int_tag&, const Policy& po
}
template <class T, class Policy>
inline T cyl_neumann_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_neumann_imp(int v, T x, const bessel_int_tag&, const Policy& pol)
{
return bessel_yn(v, x, pol);
}
template <class T, class Policy>
inline T sph_neumann_imp(unsigned v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T sph_neumann_imp(unsigned v, T x, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
static const char* function = "boost::math::sph_neumann<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::sph_neumann<%1%>(%1%,%1%)";
//
// Nothing much to do here but check for errors, and
// evaluate the function's definition directly:
@@ -340,11 +419,11 @@ inline T sph_neumann_imp(unsigned v, T x, const Policy& pol)
}
template <class T, class Policy>
inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names, needed for floor.
static const char* function = "boost::math::cyl_bessel_j_zero<%1%>(%1%, int)";
constexpr auto function = "boost::math::cyl_bessel_j_zero<%1%>(%1%, int)";
const T half_epsilon(boost::math::tools::epsilon<T>() / 2U);
@@ -395,7 +474,7 @@ inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol)
const T guess_root = boost::math::detail::bessel_zero::cyl_bessel_j_zero_detail::initial_guess<T, Policy>((order_is_integer ? vv : v), m, pol);
// Select the maximum allowed iterations from the policy.
std::uintmax_t number_of_iterations = policies::get_max_root_iterations<Policy>();
boost::math::uintmax_t number_of_iterations = policies::get_max_root_iterations<Policy>();
const T delta_lo = ((guess_root > 0.2F) ? T(0.2) : T(guess_root / 2U));
@@ -418,11 +497,11 @@ inline T cyl_bessel_j_zero_imp(T v, int m, const Policy& pol)
}
template <class T, class Policy>
inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names, needed for floor.
static const char* function = "boost::math::cyl_neumann_zero<%1%>(%1%, int)";
constexpr auto function = "boost::math::cyl_neumann_zero<%1%>(%1%, int)";
// Handle non-finite order.
if (!(boost::math::isfinite)(v) )
@@ -473,7 +552,7 @@ inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol)
const T guess_root = boost::math::detail::bessel_zero::cyl_neumann_zero_detail::initial_guess<T, Policy>(v, m, pol);
// Select the maximum allowed iterations from the policy.
std::uintmax_t number_of_iterations = policies::get_max_root_iterations<Policy>();
boost::math::uintmax_t number_of_iterations = policies::get_max_root_iterations<Policy>();
const T delta_lo = ((guess_root > 0.2F) ? T(0.2) : T(guess_root / 2U));
@@ -498,7 +577,7 @@ inline T cyl_neumann_zero_imp(T v, int m, const Policy& pol)
} // namespace detail
template <class T1, class T2, class Policy>
inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j(T1 v, T2 x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j(T1 v, T2 x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -514,13 +593,13 @@ inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j(
}
template <class T1, class T2>
inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j(T1 v, T2 x)
{
return cyl_bessel_j(v, x, policies::policy<>());
}
template <class T, class Policy>
inline typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel(unsigned v, T x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel(unsigned v, T x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T, T, Policy>::result_type result_type;
@@ -535,13 +614,13 @@ inline typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel(unsi
}
template <class T>
inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel(unsigned v, T x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel(unsigned v, T x)
{
return sph_bessel(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i(T1 v, T2 x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i(T1 v, T2 x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -556,13 +635,13 @@ inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i(
}
template <class T1, class T2>
inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i(T1 v, T2 x)
{
return cyl_bessel_i(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k(T1 v, T2 x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k(T1 v, T2 x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -578,13 +657,13 @@ inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k(
}
template <class T1, class T2>
inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k(T1 v, T2 x)
{
return cyl_bessel_k(v, x, policies::policy<>());
}
template <class T1, class T2, class Policy>
inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann(T1 v, T2 x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann(T1 v, T2 x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T1, T2, Policy>::result_type result_type;
@@ -600,13 +679,13 @@ inline typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann(T
}
template <class T1, class T2>
inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann(T1 v, T2 x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann(T1 v, T2 x)
{
return cyl_neumann(v, x, policies::policy<>());
}
template <class T, class Policy>
inline typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann(unsigned v, T x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann(unsigned v, T x, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T, T, Policy>::result_type result_type;
@@ -621,13 +700,13 @@ inline typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann(uns
}
template <class T>
inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann(unsigned v, T x)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann(unsigned v, T x)
{
return sph_neumann(v, x, policies::policy<>());
}
template <class T, class Policy>
inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_bessel_j_zero(T v, int m, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_bessel_j_zero(T v, int m, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T, T, Policy>::result_type result_type;
@@ -639,35 +718,35 @@ inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_bessel_j_ze
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
return policies::checked_narrowing_cast<result_type, Policy>(detail::cyl_bessel_j_zero_imp<value_type>(v, m, forwarding_policy()), "boost::math::cyl_bessel_j_zero<%1%>(%1%,%1%)");
}
template <class T>
inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_bessel_j_zero(T v, int m)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_bessel_j_zero(T v, int m)
{
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
return cyl_bessel_j_zero<T, policies::policy<> >(v, m, policies::policy<>());
}
template <class T, class OutputIterator, class Policy>
inline OutputIterator cyl_bessel_j_zero(T v,
BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_bessel_j_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it,
const Policy& pol)
{
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
for(int i = 0; i < static_cast<int>(number_of_zeros); ++i)
@@ -679,7 +758,7 @@ inline OutputIterator cyl_bessel_j_zero(T v,
}
template <class T, class OutputIterator>
inline OutputIterator cyl_bessel_j_zero(T v,
BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_bessel_j_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it)
@@ -688,7 +767,7 @@ inline OutputIterator cyl_bessel_j_zero(T v,
}
template <class T, class Policy>
inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_neumann_zero(T v, int m, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_neumann_zero(T v, int m, const Policy& /* pol */)
{
BOOST_FPU_EXCEPTION_GUARD
typedef typename detail::bessel_traits<T, T, Policy>::result_type result_type;
@@ -700,35 +779,35 @@ inline typename detail::bessel_traits<T, T, Policy>::result_type cyl_neumann_zer
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
return policies::checked_narrowing_cast<result_type, Policy>(detail::cyl_neumann_zero_imp<value_type>(v, m, forwarding_policy()), "boost::math::cyl_neumann_zero<%1%>(%1%,%1%)");
}
template <class T>
inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_neumann_zero(T v, int m)
BOOST_MATH_GPU_ENABLED inline typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_neumann_zero(T v, int m)
{
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
return cyl_neumann_zero<T, policies::policy<> >(v, m, policies::policy<>());
}
template <class T, class OutputIterator, class Policy>
inline OutputIterator cyl_neumann_zero(T v,
BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_neumann_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it,
const Policy& pol)
{
static_assert( false == std::numeric_limits<T>::is_specialized
|| ( true == std::numeric_limits<T>::is_specialized
&& false == std::numeric_limits<T>::is_integer),
static_assert( false == boost::math::numeric_limits<T>::is_specialized
|| ( true == boost::math::numeric_limits<T>::is_specialized
&& false == boost::math::numeric_limits<T>::is_integer),
"Order must be a floating-point type.");
for(int i = 0; i < static_cast<int>(number_of_zeros); ++i)
@@ -740,7 +819,7 @@ inline OutputIterator cyl_neumann_zero(T v,
}
template <class T, class OutputIterator>
inline OutputIterator cyl_neumann_zero(T v,
BOOST_MATH_GPU_ENABLED inline OutputIterator cyl_neumann_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it)
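A minimal illustration (not part of the diff) of what the NVRTC copy of bessel_traits near the top of this file computes: an integral order promotes from the argument type alone, otherwise both arguments participate. std::is_same is used here purely for exposition:

#include <type_traits>
#include <boost/math/special_functions/bessel.hpp>

// Integral order: result_type comes from promote_args<T2> alone.
static_assert(std::is_same<
    boost::math::detail::bessel_traits<int, float, boost::math::policies::policy<>>::result_type,
    float>::value, "integral order promotes the argument type only");

// Two floating-point arguments: both participate in the promotion.
static_assert(std::is_same<
    boost::math::detail::bessel_traits<double, float, boost::math::policies::policy<>>::result_type,
    double>::value, "mixed arguments promote to the wider type");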

View File: include/boost/math/special_functions/detail/airy_ai_bi_zero.hpp

@@ -13,6 +13,8 @@
#ifndef BOOST_MATH_AIRY_AI_BI_ZERO_2013_01_20_HPP_
#define BOOST_MATH_AIRY_AI_BI_ZERO_2013_01_20_HPP_
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/special_functions/cbrt.hpp>
@@ -21,18 +23,18 @@
{
// Forward declarations of the needed Airy function implementations.
template <class T, class Policy>
T airy_ai_imp(T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED T airy_ai_imp(T x, const Policy& pol);
template <class T, class Policy>
T airy_bi_imp(T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED T airy_bi_imp(T x, const Policy& pol);
template <class T, class Policy>
T airy_ai_prime_imp(T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED T airy_ai_prime_imp(T x, const Policy& pol);
template <class T, class Policy>
T airy_bi_prime_imp(T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED T airy_bi_prime_imp(T x, const Policy& pol);
namespace airy_zero
{
template<class T, class Policy>
T equation_as_10_4_105(const T& z, const Policy& pol)
BOOST_MATH_GPU_ENABLED T equation_as_10_4_105(const T& z, const Policy& pol)
{
const T one_over_z (T(1) / z);
const T one_over_z_squared(one_over_z * one_over_z);
@@ -54,7 +56,7 @@
namespace airy_ai_zero_detail
{
template<class T, class Policy>
T initial_guess(const int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED T initial_guess(const int m, const Policy& pol)
{
T guess;
@@ -106,11 +108,19 @@
class function_object_ai_and_ai_prime
{
public:
explicit function_object_ai_and_ai_prime(const Policy& pol) : my_pol(pol) { }
BOOST_MATH_GPU_ENABLED explicit function_object_ai_and_ai_prime(const Policy& pol) : my_pol(pol) { }
function_object_ai_and_ai_prime(const function_object_ai_and_ai_prime&) = default;
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_suppress 20012
#endif
boost::math::tuple<T, T> operator()(const T& x) const
BOOST_MATH_GPU_ENABLED function_object_ai_and_ai_prime(const function_object_ai_and_ai_prime&) = default;
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_default 20012
#endif
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(const T& x) const
{
// Return a tuple containing both Ai(x) and Ai'(x).
return boost::math::make_tuple(
@@ -127,7 +137,7 @@
namespace airy_bi_zero_detail
{
template<class T, class Policy>
T initial_guess(const int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED T initial_guess(const int m, const Policy& pol)
{
T guess;
@@ -179,11 +189,19 @@
class function_object_bi_and_bi_prime
{
public:
explicit function_object_bi_and_bi_prime(const Policy& pol) : my_pol(pol) { }
BOOST_MATH_GPU_ENABLED explicit function_object_bi_and_bi_prime(const Policy& pol) : my_pol(pol) { }
function_object_bi_and_bi_prime(const function_object_bi_and_bi_prime&) = default;
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_suppress 20012
#endif
BOOST_MATH_GPU_ENABLED function_object_bi_and_bi_prime(const function_object_bi_and_bi_prime&) = default;
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_default 20012
#endif
boost::math::tuple<T, T> operator()(const T& x) const
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(const T& x) const
{
// Return a tuple containing both Bi(x) and Bi'(x).
return boost::math::make_tuple(
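These function objects exist to feed the Newton-Raphson solver that the commit makes GPU compatible: the solver expects a callable returning the function value and its derivative. A minimal sketch of that contract, with a hypothetical functor and placeholder bracket/iteration values:

#include <boost/math/tools/roots.hpp>
#include <boost/math/tools/tuple.hpp>

// Hypothetical functor in the same style as the Airy objects above:
// operator() returns { f(x), f'(x) } for Newton steps on host or device.
struct sqrt_two_functor
{
   BOOST_MATH_GPU_ENABLED boost::math::tuple<double, double> operator()(double x) const
   {
      return boost::math::make_tuple(x * x - 2, 2 * x);
   }
};

BOOST_MATH_GPU_ENABLED inline double sqrt_two_by_newton()
{
   boost::math::uintmax_t max_iter = 50; // placeholder iteration cap
   return boost::math::tools::newton_raphson_iterate(
      sqrt_two_functor(), 1.5, 1.0, 2.0, 40, max_iter);
}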

View File: include/boost/math/special_functions/detail/bessel_i0.hpp

@@ -1,5 +1,6 @@
// Copyright (c) 2006 Xiaogang Zhang
// Copyright (c) 2017 John Maddock
// Copyright (c) 2024 Matt Borland
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -14,6 +15,9 @@
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/precision.hpp>
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -35,24 +39,24 @@
namespace boost { namespace math { namespace detail{
template <typename T>
T bessel_i0(const T& x);
BOOST_MATH_GPU_ENABLED T bessel_i0(const T& x);
template <typename T, int N>
T bessel_i0_imp(const T&, const std::integral_constant<int, N>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T&, const boost::math::integral_constant<int, N>&)
{
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
T bessel_i0_imp(const T& x, const std::integral_constant<int, 24>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant<int, 24>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
{
// Max error in interpolated form: 3.929e-08
// Max Error found at float precision = Poly: 1.991226e-07
static const float P[] = {
BOOST_MATH_STATIC const float P[] = {
1.00000003928615375e+00f,
2.49999576572179639e-01f,
2.77785268558399407e-02f,
@@ -70,7 +74,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 24>&)
{
// Max error in interpolated form: 5.195e-08
// Max Error found at float precision = Poly: 8.502534e-08
static const float P[] = {
BOOST_MATH_STATIC const float P[] = {
3.98942651588301770e-01f,
4.98327234176892844e-02f,
2.91866904423115499e-02f,
@@ -83,7 +87,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 24>&)
{
// Max error in interpolated form: 1.782e-09
// Max Error found at float precision = Poly: 6.473568e-08
static const float P[] = {
BOOST_MATH_STATIC const float P[] = {
3.98942391532752700e-01f,
4.98455950638200020e-02f,
2.94835666900682535e-02f
@@ -96,7 +100,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 24>&)
}
template <typename T>
T bessel_i0_imp(const T& x, const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant<int, 53>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -104,7 +108,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 53>&)
// Bessel I0 over [10^-16, 7.75]
// Max error in interpolated form : 3.042e-18
// Max Error found at double precision = Poly : 5.106609e-16 Cheb : 5.239199e-16
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
1.00000000000000000e+00,
2.49999999999999909e-01,
2.77777777777782257e-02,
@@ -128,7 +132,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 53>&)
{
// Max error in interpolated form : 1.685e-16
// Max Error found at double precision = Poly : 2.575063e-16 Cheb : 2.247615e+00
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
3.98942280401425088e-01,
4.98677850604961985e-02,
2.80506233928312623e-02,
@@ -158,7 +162,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 53>&)
{
// Max error in interpolated form : 2.437e-18
// Max Error found at double precision = Poly : 1.216719e-16
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
3.98942280401432905e-01,
4.98677850491434560e-02,
2.80506308916506102e-02,
@@ -173,7 +177,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 53>&)
}
template <typename T>
T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant<int, 64>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -182,7 +186,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
// Max error in interpolated form : 3.899e-20
// Max Error found at float80 precision = Poly : 1.770840e-19
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 9.99999999999999999961011629e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.50000000000000001321873912e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.77777777777777703400424216e-02),
@@ -211,8 +215,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
// Maximum Relative Change in Control Points : 1.631e-04
// Max Error found at float80 precision = Poly : 7.811948e-21
// LCOV_EXCL_START
static const T Y = 4.051098823547363281250e-01f;
static const T P[] = {
BOOST_MATH_STATIC const T Y = 4.051098823547363281250e-01f;
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, -6.158081780620616479492e-03),
BOOST_MATH_BIG_CONSTANT(T, 64, 4.883635969834048766148e-02),
BOOST_MATH_BIG_CONSTANT(T, 64, 7.892782002476195771920e-02),
@@ -237,8 +241,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
// Maximum Relative Change in Control Points : 1.304e-03
// Max Error found at float80 precision = Poly : 2.303527e-20
// LCOV_EXCL_START
static const T Y = 4.033188819885253906250e-01f;
static const T P[] = {
BOOST_MATH_STATIC const T Y = 4.033188819885253906250e-01f;
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, -4.376373876116109401062e-03),
BOOST_MATH_BIG_CONSTANT(T, 64, 4.982899138682911273321e-02),
BOOST_MATH_BIG_CONSTANT(T, 64, 3.109477529533515397644e-02),
@@ -262,8 +266,8 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
// Max error in interpolated form: 1.035e-21
// Max Error found at float80 precision = Poly: 1.885872e-21
// LCOV_EXCL_START
static const T Y = 4.011702537536621093750e-01f;
static const T P[] = {
BOOST_MATH_STATIC const T Y = 4.011702537536621093750e-01f;
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, -2.227973351806078464328e-03),
BOOST_MATH_BIG_CONSTANT(T, 64, 4.986778486088017419036e-02),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.805066823812285310011e-02),
@@ -291,7 +295,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
// Max error in interpolated form : 5.587e-20
// Max Error found at float80 precision = Poly : 8.776852e-20
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401432677955074061e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 4.98677850501789875615574058e-02),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.80506290908675604202206833e-02),
@@ -320,7 +324,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 64>&)
}
template <typename T>
T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant<int, 113>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -329,7 +333,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form : 1.274e-34
// Max Error found at float128 precision = Poly : 3.096091e-34
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 1.0000000000000000000000000000000001273856e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.4999999999999999999999999999999107477496e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.7777777777777777777777777777881795230918e-02),
@@ -364,7 +368,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form : 7.534e-35
// Max Error found at float128 precision = Poly : 6.123912e-34
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 9.9999999999999999992388573069504617493518e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.5000000000000000007304739268173096975340e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.7777777777777777744261405400543564492074e-02),
@@ -403,7 +407,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form : 1.808e-34
// Max Error found at float128 precision = Poly : 2.399403e-34
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040870793650581242239624530714032e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867780576714783790784348982178607842250e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.8051948347934462928487999569249907599510e-02),
@@ -445,7 +449,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form : 1.487e-34
// Max Error found at float128 precision = Poly : 1.929924e-34
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793996798658172135362278e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867785050179084714910130342157246539820e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.8050629090725751585266360464766768437048e-02),
@@ -480,7 +484,7 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form : 5.459e-35
// Max Error found at float128 precision = Poly : 1.472240e-34
// LCOV_EXCL_START
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793994605993438166526772e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 4.9867785050179084742493257495245185241487e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.8050629090725735167652437695397756897920e-02),
@@ -507,33 +511,33 @@ T bessel_i0_imp(const T& x, const std::integral_constant<int, 113>&)
}
template <typename T>
T bessel_i0_imp(const T& x, const std::integral_constant<int, 0>&)
BOOST_MATH_GPU_ENABLED T bessel_i0_imp(const T& x, const boost::math::integral_constant<int, 0>&)
{
if(boost::math::tools::digits<T>() <= 24)
return bessel_i0_imp(x, std::integral_constant<int, 24>());
return bessel_i0_imp(x, boost::math::integral_constant<int, 24>());
else if(boost::math::tools::digits<T>() <= 53)
return bessel_i0_imp(x, std::integral_constant<int, 53>());
return bessel_i0_imp(x, boost::math::integral_constant<int, 53>());
else if(boost::math::tools::digits<T>() <= 64)
return bessel_i0_imp(x, std::integral_constant<int, 64>());
return bessel_i0_imp(x, boost::math::integral_constant<int, 64>());
else if(boost::math::tools::digits<T>() <= 113)
return bessel_i0_imp(x, std::integral_constant<int, 113>());
return bessel_i0_imp(x, boost::math::integral_constant<int, 113>());
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
inline T bessel_i0(const T& x)
BOOST_MATH_GPU_ENABLED inline T bessel_i0(const T& x)
{
typedef std::integral_constant<int,
((std::numeric_limits<T>::digits == 0) || (std::numeric_limits<T>::radix != 2)) ?
typedef boost::math::integral_constant<int,
((boost::math::numeric_limits<T>::digits == 0) || (boost::math::numeric_limits<T>::radix != 2)) ?
0 :
std::numeric_limits<T>::digits <= 24 ?
boost::math::numeric_limits<T>::digits <= 24 ?
24 :
std::numeric_limits<T>::digits <= 53 ?
boost::math::numeric_limits<T>::digits <= 53 ?
53 :
std::numeric_limits<T>::digits <= 64 ?
boost::math::numeric_limits<T>::digits <= 64 ?
64 :
std::numeric_limits<T>::digits <= 113 ?
boost::math::numeric_limits<T>::digits <= 113 ?
113 : -1
> tag_type;
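The front end above picks a digit-tagged overload at compile time so only one set of polynomial coefficients is instantiated. A simplified sketch of the idiom, reduced to two precision tiers (the real selector handles 24/53/64/113 plus a runtime fallback):

#include <boost/math/special_functions/detail/bessel_i0.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>

// Simplified tag dispatch: numeric_limits<T>::digits selects the overload.
template <typename T>
BOOST_MATH_GPU_ENABLED T bessel_i0_sketch(const T& x)
{
   typedef boost::math::integral_constant<int,
      boost::math::numeric_limits<T>::digits <= 53 ? 53 : 113> tag_type;
   return boost::math::detail::bessel_i0_imp(x, tag_type());
}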

View File: include/boost/math/special_functions/detail/bessel_i1.hpp

@@ -1,4 +1,5 @@
// Copyright (c) 2017 John Maddock
// Copyright (c) 2024 Matt Borland
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -17,9 +18,13 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/precision.hpp>
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -38,24 +43,24 @@
namespace boost { namespace math { namespace detail{
template <typename T>
T bessel_i1(const T& x);
BOOST_MATH_GPU_ENABLED T bessel_i1(const T& x);
template <typename T, int N>
T bessel_i1_imp(const T&, const std::integral_constant<int, N>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T&, const boost::math::integral_constant<int, N>&)
{
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
T bessel_i1_imp(const T& x, const std::integral_constant<int, 24>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant<int, 24>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
{
//Max error in interpolated form : 1.348e-08
// Max Error found at float precision = Poly : 1.469121e-07
static const float P[] = {
BOOST_MATH_STATIC const float P[] = {
8.333333221e-02f,
6.944453712e-03f,
3.472097211e-04f,
@@ -74,7 +79,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 24>&)
// Max error in interpolated form: 9.000e-08
// Max Error found at float precision = Poly: 1.044345e-07
static const float P[] = {
BOOST_MATH_STATIC const float P[] = {
3.98942115977513013e-01f,
-1.49581264836620262e-01f,
-4.76475741878486795e-02f,
@@ -89,7 +94,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 24>&)
}
template <typename T>
T bessel_i1_imp(const T& x, const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant<int, 53>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -98,7 +103,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 53>&)
// Max error in interpolated form: 5.639e-17
// Max Error found at double precision = Poly: 1.795559e-16
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
8.333333333333333803e-02,
6.944444444444341983e-03,
3.472222222225921045e-04,
@@ -122,7 +127,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 53>&)
// Max error in interpolated form: 1.796e-16
// Max Error found at double precision = Poly: 2.898731e-16
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
3.989422804014406054e-01,
-1.496033551613111533e-01,
-4.675104253598537322e-02,
@@ -152,7 +157,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 53>&)
{
// Max error in interpolated form: 1.320e-19
// Max Error found at double precision = Poly: 7.065357e-17
static const double P[] = {
BOOST_MATH_STATIC const double P[] = {
3.989422804014314820e-01,
-1.496033551467584157e-01,
-4.675105322571775911e-02,
@@ -167,7 +172,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 53>&)
}
template <typename T>
T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant<int, 64>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -175,7 +180,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
// Bessel I1 over [10^-16, 7.75]
// Max error in interpolated form: 8.086e-21
// Max Error found at float80 precision = Poly: 7.225090e-20
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 8.33333333333333333340071817e-02),
BOOST_MATH_BIG_CONSTANT(T, 64, 6.94444444444444442462728070e-03),
BOOST_MATH_BIG_CONSTANT(T, 64, 3.47222222222222318886683883e-04),
@@ -203,7 +208,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
// Maximum Deviation Found : 3.887e-20
// Expected Error Term : 3.887e-20
// Maximum Relative Change in Control Points : 1.681e-04
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942260530218897338680e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.49599542849073670179540e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.70492865454119188276875e-02),
@@ -236,7 +241,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
// Maximum Relative Change in Control Points : 2.101e-03
// Max Error found at float80 precision = Poly : 6.029974e-20
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401431675205845e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.49603355149968887210170e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.67510486284376330257260e-02),
@@ -258,7 +263,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
// Bessel I1 over [100, INF]
// Max error in interpolated form: 2.456e-20
// Max Error found at float80 precision = Poly: 5.446356e-20
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 3.98942280401432677958445e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.49603355150537411254359e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.67510484842456251368526e-02),
@@ -276,7 +281,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 64>&)
}
template <typename T>
T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant<int, 113>&)
{
BOOST_MATH_STD_USING
if(x < 7.75)
@@ -285,7 +290,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form: 1.835e-35
// Max Error found at float128 precision = Poly: 1.645036e-34
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 8.3333333333333333333333333333333331804098e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 6.9444444444444444444444444444445418303082e-03),
BOOST_MATH_BIG_CONSTANT(T, 113, 3.4722222222222222222222222222119082346591e-04),
@@ -321,7 +326,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Maximum Relative Change in Control Points : 5.204e-03
// Max Error found at float128 precision = Poly : 2.882561e-34
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 8.333333333333333326889717360850080939e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 6.944444444444444511272790848815114507e-03),
BOOST_MATH_BIG_CONSTANT(T, 113, 3.472222222222221892451965054394153443e-04),
@@ -355,7 +360,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Maximum Deviation Found : 1.766e-35
// Expected Error Term : 1.021e-35
// Maximum Relative Change in Control Points : 6.228e-03
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 8.333333333333255774414858563409941233e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, 6.944444444444897867884955912228700291e-03),
BOOST_MATH_BIG_CONSTANT(T, 113, 3.472222222220954970397343617150959467e-04),
@@ -389,7 +394,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
{
// Max error in interpolated form: 8.864e-36
// Max Error found at float128 precision = Poly: 8.522841e-35
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422793693152031514179994954750043e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -1.496029423752889591425633234009799670e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -4.682975926820553021482820043377990241e-02),
@@ -421,7 +426,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form: 6.028e-35
// Max Error found at float128 precision = Poly: 1.368313e-34
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422804012941975429616956496046931e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -1.496033550576049830976679315420681402e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -4.675107835141866009896710750800622147e-02),
@@ -456,7 +461,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Max error in interpolated form: 5.494e-35
// Max Error found at float128 precision = Poly: 1.214651e-34
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.989422804014326779399307367861631577e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -1.496033551505372542086590873271571919e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -4.675104848454290286276466276677172664e-02),
@@ -486,7 +491,7 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
// Bessel I1 over [100, INF]
// Max error in interpolated form: 6.081e-35
// Max Error found at float128 precision = Poly: 1.407151e-34
static const T P[] = {
BOOST_MATH_STATIC const T P[] = {
BOOST_MATH_BIG_CONSTANT(T, 113, 3.9894228040143267793994605993438200208417e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -1.4960335515053725422747977247811372936584e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -4.6751048484542891946087411826356811991039e-02),
@@ -512,33 +517,33 @@ T bessel_i1_imp(const T& x, const std::integral_constant<int, 113>&)
}
template <typename T>
T bessel_i1_imp(const T& x, const std::integral_constant<int, 0>&)
BOOST_MATH_GPU_ENABLED T bessel_i1_imp(const T& x, const boost::math::integral_constant<int, 0>&)
{
if(boost::math::tools::digits<T>() <= 24)
return bessel_i1_imp(x, std::integral_constant<int, 24>());
return bessel_i1_imp(x, boost::math::integral_constant<int, 24>());
else if(boost::math::tools::digits<T>() <= 53)
return bessel_i1_imp(x, std::integral_constant<int, 53>());
return bessel_i1_imp(x, boost::math::integral_constant<int, 53>());
else if(boost::math::tools::digits<T>() <= 64)
return bessel_i1_imp(x, std::integral_constant<int, 64>());
return bessel_i1_imp(x, boost::math::integral_constant<int, 64>());
else if(boost::math::tools::digits<T>() <= 113)
return bessel_i1_imp(x, std::integral_constant<int, 113>());
return bessel_i1_imp(x, boost::math::integral_constant<int, 113>());
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
inline T bessel_i1(const T& x)
BOOST_MATH_GPU_ENABLED inline T bessel_i1(const T& x)
{
typedef std::integral_constant<int,
((std::numeric_limits<T>::digits == 0) || (std::numeric_limits<T>::radix != 2)) ?
typedef boost::math::integral_constant<int,
((boost::math::numeric_limits<T>::digits == 0) || (boost::math::numeric_limits<T>::radix != 2)) ?
0 :
std::numeric_limits<T>::digits <= 24 ?
boost::math::numeric_limits<T>::digits <= 24 ?
24 :
std::numeric_limits<T>::digits <= 53 ?
boost::math::numeric_limits<T>::digits <= 53 ?
53 :
std::numeric_limits<T>::digits <= 64 ?
boost::math::numeric_limits<T>::digits <= 64 ?
64 :
std::numeric_limits<T>::digits <= 113 ?
boost::math::numeric_limits<T>::digits <= 113 ?
113 : -1
> tag_type;

View File: include/boost/math/special_functions/detail/bessel_ik.hpp

@@ -1,4 +1,5 @@
// Copyright (c) 2006 Xiaogang Zhang
// Copyright (c) 2024 Matt Borland
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,14 +11,17 @@
#pragma once
#endif
#include <cmath>
#include <cstdint>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/series.hpp>
#include <boost/math/special_functions/sign.hpp>
#include <boost/math/special_functions/round.hpp>
#include <boost/math/special_functions/gamma.hpp>
#include <boost/math/special_functions/sin_pi.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/tools/config.hpp>
// Modified Bessel functions of the first and second kind of fractional order
@@ -30,13 +34,13 @@ struct cyl_bessel_i_small_z
{
typedef T result_type;
cyl_bessel_i_small_z(T v_, T z_) : k(0), v(v_), mult(z_*z_/4)
BOOST_MATH_GPU_ENABLED cyl_bessel_i_small_z(T v_, T z_) : k(0), v(v_), mult(z_*z_/4)
{
BOOST_MATH_STD_USING
term = 1;
}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
T result = term;
++k;
@@ -52,7 +56,7 @@ private:
};
template <class T, class Policy>
inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
{
BOOST_MATH_STD_USING
T prefix;
@@ -69,7 +73,7 @@ inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
return prefix;
cyl_bessel_i_small_z<T, Policy> s(v, x);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon<T, Policy>(), max_iter);
@@ -80,7 +84,7 @@ inline T bessel_i_small_z_series(T v, T x, const Policy& pol)
// Calculate K(v, x) and K(v+1, x) by method analogous to
// Temme, Journal of Computational Physics, vol 21, 343 (1976)
template <typename T, typename Policy>
int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol)
BOOST_MATH_GPU_ENABLED int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol)
{
T f, h, p, q, coef, sum, sum1, tolerance;
T a, b, c, d, sigma, gamma1, gamma2;
@@ -157,7 +161,7 @@ int temme_ik(T v, T x, T* result_K, T* K1, const Policy& pol)
// Evaluate continued fraction fv = I_(v+1) / I_v, derived from
// Abramowitz and Stegun, Handbook of Mathematical Functions, 1972, 9.1.73
template <typename T, typename Policy>
int CF1_ik(T v, T x, T* fv, const Policy& pol)
BOOST_MATH_GPU_ENABLED int CF1_ik(T v, T x, T* fv, const Policy& pol)
{
T C, D, f, a, b, delta, tiny, tolerance;
unsigned long k;
@@ -204,7 +208,7 @@ int CF1_ik(T v, T x, T* fv, const Policy& pol)
// z1 / z0 = U(v+1.5, 2v+1, 2x) / U(v+0.5, 2v+1, 2x), see
// Thompson and Barnett, Computer Physics Communications, vol 47, 245 (1987)
template <typename T, typename Policy>
int CF2_ik(T v, T x, T* Kv, T* Kv1, const Policy& pol)
BOOST_MATH_GPU_ENABLED int CF2_ik(T v, T x, T* Kv, T* Kv1, const Policy& pol)
{
BOOST_MATH_STD_USING
using namespace boost::math::constants;
@@ -297,7 +301,7 @@ enum{
// Compute I(v, x) and K(v, x) simultaneously by Temme's method, see
// Temme, Journal of Computational Physics, vol 19, 324 (1975)
template <typename T, typename Policy>
int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
BOOST_MATH_GPU_ENABLED int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
{
// Kv1 = K_(v+1), fv = I_(v+1) / I_v
// Ku1 = K_(u+1), fu = I_(u+1) / I_u
@@ -314,7 +318,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
using namespace boost::math::tools;
using namespace boost::math::constants;
static const char* function = "boost::math::bessel_ik<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::bessel_ik<%1%>(%1%,%1%)";
if (v < 0)
{
@@ -329,7 +333,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
if (((kind & need_i) == 0) && (fabs(4 * v * v - 25) / (8 * x) < tools::forth_root_epsilon<T>()))
{
// A&S 9.7.2
Iv = std::numeric_limits<T>::quiet_NaN(); // any value will do
Iv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do
T mu = 4 * v * v;
T eight_z = 8 * x;
Kv = 1 + (mu - 1) / eight_z + (mu - 1) * (mu - 9) / (2 * eight_z * eight_z) + (mu - 1) * (mu - 9) * (mu - 25) / (6 * eight_z * eight_z * eight_z);
@@ -410,7 +414,7 @@ int bessel_ik(T v, T x, T* result_I, T* result_K, int kind, const Policy& pol)
}
}
else
Iv = std::numeric_limits<T>::quiet_NaN(); // any value will do
Iv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do
}
if (reflect)
{
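Throughout this file the std:: entities are swapped for boost::math:: shims (uintmax_t, numeric_limits<T>::quiet_NaN) because NVRTC has no standard headers. A simplified sketch of the shim idea — the real definitions live in boost/math/tools/cstdint.hpp and boost/math/tools/numeric_limits.hpp and are more complete than this:

#ifndef BOOST_MATH_HAS_NVRTC
#include <cstdint>
#endif

namespace boost { namespace math {

#ifdef BOOST_MATH_HAS_NVRTC
typedef unsigned long long uintmax_t;  // device-safe stand-in: no <cstdint> on NVRTC
#else
using std::uintmax_t;                  // host builds: exactly the std type
#endif

}} // namespace boost::math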

View File: include/boost/math/special_functions/detail/bessel_j0.hpp

@@ -10,6 +10,7 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
@@ -32,10 +33,10 @@
namespace boost { namespace math { namespace detail{
template <typename T>
T bessel_j0(T x);
BOOST_MATH_GPU_ENABLED T bessel_j0(T x);
template <typename T>
T bessel_j0(T x)
BOOST_MATH_GPU_ENABLED T bessel_j0(T x)
{
#ifdef BOOST_MATH_INSTRUMENT
static bool b = false;
@@ -48,7 +49,7 @@ T bessel_j0(T x)
}
#endif
static const T P1[] = {
BOOST_MATH_STATIC const T P1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.1298668500990866786e+11)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.7282507878605942706e+10)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.2140700423540120665e+08)),
@@ -57,7 +58,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0344222815443188943e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2117036164593528341e-01))
};
static const T Q1[] = {
BOOST_MATH_STATIC const T Q1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.3883787996332290397e+12)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.6328198300859648632e+10)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.3985097372263433271e+08)),
@@ -66,7 +67,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0))
};
static const T P2[] = {
BOOST_MATH_STATIC const T P2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8319397969392084011e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2254078161378989535e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -7.2879702464464618998e+03)),
@@ -76,7 +77,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.4321196680624245801e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.8591703355916499363e+01))
};
static const T Q2[] = {
BOOST_MATH_STATIC const T Q2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.5783478026152301072e+05)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4599102262586308984e+05)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.4055062591169562211e+04)),
@@ -86,7 +87,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -2.5258076240801555057e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T PC[] = {
BOOST_MATH_STATIC const T PC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684302e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1345386639580765797e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1170523380864944322e+04)),
@@ -94,7 +95,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5376201909008354296e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.8961548424210455236e-01))
};
static const T QC[] = {
BOOST_MATH_STATIC const T QC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684318e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1370412495510416640e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1215350561880115730e+04)),
@@ -102,7 +103,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5711159858080893649e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T PS[] = {
BOOST_MATH_STATIC const T PS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.9226600200800094098e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8591953644342993800e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1183429920482737611e+02)),
@@ -110,7 +111,7 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2441026745835638459e+00)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.8033303048680751817e-03))
};
static const T QS[] = {
BOOST_MATH_STATIC const T QS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.7105024128512061905e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1951131543434613647e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.2642780169211018836e+03)),
@@ -118,12 +119,13 @@ T bessel_j0(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 9.0593769594993125859e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4048255576957727686e+00)),
x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5200781102863106496e+00)),
x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.160e+02)),
x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.42444230422723137837e-03)),
x21 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4130e+03)),
x22 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.46860286310649596604e-04));
BOOST_MATH_STATIC const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.4048255576957727686e+00));
BOOST_MATH_STATIC const T x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5200781102863106496e+00));
BOOST_MATH_STATIC const T x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.160e+02));
BOOST_MATH_STATIC const T x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.42444230422723137837e-03));
BOOST_MATH_STATIC const T x21 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4130e+03));
BOOST_MATH_STATIC const T x22 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.46860286310649596604e-04));
T value, factor, r, rc, rs;
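
These tables are consumed by Boost.Math's rational-evaluation helper; a simplified sketch of the pattern (the two-entry coefficients below are made up, and the real j0 also folds in the root-shift constants x1, x11, x12 listed above):

#include <boost/math/tools/rational.hpp>

// Sketch only: evaluate P(x^2) / Q(x^2) from coefficient tables stored
// lowest-order-first, the same layout used by the tables above.
template <class T>
T rational_approx(T x)
{
   static const T P[] = { T(1), T(-0.25) };   // placeholder coefficients
   static const T Q[] = { T(1), T(0.0625) };
   return boost::math::tools::evaluate_rational(P, Q, x * x);
}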

View File

@@ -10,6 +10,7 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
@@ -32,27 +33,29 @@
namespace boost { namespace math{ namespace detail{
template <typename T>
T bessel_j1(T x);
BOOST_MATH_GPU_ENABLED T bessel_j1(T x);
template <class T>
struct bessel_j1_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init();
}
static void do_init()
BOOST_MATH_GPU_ENABLED static void do_init()
{
bessel_j1(T(1));
}
void force_instantiate()const{}
BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_STATIC const init initializer;
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
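
This is the library's usual instantiation-forcing idiom: a static member whose constructor primes the coefficient tables at program start-up, ODR-used through a no-op call. On GPU builds dynamic initializers are unavailable, hence the new #ifndef. A stripped-down sketch with hypothetical names:

// warm_up() stands in for the bessel_j1(T(1)) call that primes the tables.
template <class T>
void warm_up() { /* evaluate the function once */ }

template <class T>
struct table_initializer
{
   struct init
   {
      init() { warm_up<T>(); }            // runs during dynamic initialization
      void force_instantiate() const {}   // no-op; exists only to be ODR-used
   };
   static const init initializer;
   static void force_instantiate()
   {
#ifndef BOOST_MATH_HAS_GPU_SUPPORT        // no dynamic initializers on device
      initializer.force_instantiate();
#endif
   }
};
template <class T>
const typename table_initializer<T>::init table_initializer<T>::initializer;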
@@ -60,11 +63,11 @@ template <class T>
const typename bessel_j1_initializer<T>::init bessel_j1_initializer<T>::initializer;
template <typename T>
T bessel_j1(T x)
BOOST_MATH_GPU_ENABLED T bessel_j1(T x)
{
bessel_j1_initializer<T>::force_instantiate();
static const T P1[] = {
BOOST_MATH_STATIC const T P1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4258509801366645672e+11)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6781041261492395835e+09)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1548696764841276794e+08)),
@@ -73,7 +76,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0650724020080236441e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.0767857011487300348e-02))
};
static const T Q1[] = {
BOOST_MATH_STATIC const T Q1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1868604460820175290e+12)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.2091902282580133541e+10)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.0228375140097033958e+08)),
@@ -82,7 +85,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0))
};
static const T P2[] = {
BOOST_MATH_STATIC const T P2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.7527881995806511112e+16)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.6608531731299018674e+15)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.6658018905416665164e+13)),
@@ -92,7 +95,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -7.5023342220781607561e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.6179191852758252278e+00))
};
static const T Q2[] = {
BOOST_MATH_STATIC const T Q2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7253905888447681194e+18)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7128800897135812012e+16)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.4899346165481429307e+13)),
@@ -102,7 +105,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.3886978985861357615e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T PC[] = {
BOOST_MATH_STATIC const T PC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278571e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9422465050776411957e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.6033732483649391093e+06)),
@@ -111,7 +114,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.6116166443246101165e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0))
};
static const T QC[] = {
BOOST_MATH_STATIC const T QC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278568e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9341243899345856590e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.5853394797230870728e+06)),
@@ -120,7 +123,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4550094401904961825e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T PS[] = {
BOOST_MATH_STATIC const T PS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.3220913409857223519e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.5145160675335701966e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6178836581270835179e+04)),
@@ -129,7 +132,7 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.5265133846636032186e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0))
};
static const T QS[] = {
BOOST_MATH_STATIC const T QS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0871281941028743574e+05)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8194580422439972989e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4194606696037208929e+06)),
@@ -138,12 +141,13 @@ T bessel_j1(T x)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.6383677696049909675e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0))
};
static const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.8317059702075123156e+00)),
x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0155866698156187535e+00)),
x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 9.810e+02)),
x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.2527979248768438556e-04)),
x21 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7960e+03)),
x22 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.8330184381246462950e-05));
BOOST_MATH_STATIC const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.8317059702075123156e+00));
BOOST_MATH_STATIC const T x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0155866698156187535e+00));
BOOST_MATH_STATIC const T x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 9.810e+02));
BOOST_MATH_STATIC const T x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.2527979248768438556e-04));
BOOST_MATH_STATIC const T x21 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7960e+03));
BOOST_MATH_STATIC const T x22 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.8330184381246462950e-05));
T value, factor, r, rc, rs, w;

View File

@@ -10,6 +10,10 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/gamma.hpp>
#include <boost/math/special_functions/detail/bessel_j0.hpp>
#include <boost/math/special_functions/detail/bessel_j1.hpp>
#include <boost/math/special_functions/detail/bessel_jy.hpp>
@@ -24,7 +28,7 @@
namespace boost { namespace math { namespace detail{
template <typename T, typename Policy>
T bessel_jn(int n, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED T bessel_jn(int n, T x, const Policy& pol)
{
T value(0), factor, current, prev, next;
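
For reference, integer orders are built from the three-term recurrence shared by all cylinder functions,

$$
\mathcal{C}_{n+1}(x) = \frac{2n}{x}\,\mathcal{C}_n(x) - \mathcal{C}_{n-1}(x),
$$

run upward from J0 and J1 where that is stable (roughly n < x) and via backward, Miller-style recurrence otherwise, since forward recurrence on J loses accuracy once the order exceeds the argument.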

View File

@@ -11,16 +11,18 @@
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/special_functions/gamma.hpp>
#include <boost/math/special_functions/sign.hpp>
#include <boost/math/special_functions/hypot.hpp>
#include <boost/math/special_functions/sin_pi.hpp>
#include <boost/math/special_functions/cos_pi.hpp>
#include <boost/math/special_functions/round.hpp>
#include <boost/math/special_functions/detail/bessel_jy_asym.hpp>
#include <boost/math/special_functions/detail/bessel_jy_series.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <complex>
// Bessel functions of the first and second kind of fractional order
@@ -38,7 +40,7 @@ namespace boost { namespace math {
// try it and see...
//
template <class T, class Policy>
bool hankel_PQ(T v, T x, T* p, T* q, const Policy& )
BOOST_MATH_GPU_ENABLED bool hankel_PQ(T v, T x, T* p, T* q, const Policy& )
{
BOOST_MATH_STD_USING
T tolerance = 2 * policies::get_epsilon<T, Policy>();
@@ -70,7 +72,7 @@ namespace boost { namespace math {
// Calculate Y(v, x) and Y(v+1, x) by Temme's method, see
// Temme, Journal of Computational Physics, vol 21, 343 (1976)
template <typename T, typename Policy>
int temme_jy(T v, T x, T* Y, T* Y1, const Policy& pol)
BOOST_MATH_GPU_ENABLED int temme_jy(T v, T x, T* Y, T* Y1, const Policy& pol)
{
T g, h, p, q, f, coef, sum, sum1, tolerance;
T a, d, e, sigma;
@@ -139,7 +141,7 @@ namespace boost { namespace math {
// Evaluate continued fraction fv = J_(v+1) / J_v, see
// Abramowitz and Stegun, Handbook of Mathematical Functions, 1972, 9.1.73
template <typename T, typename Policy>
int CF1_jy(T v, T x, T* fv, int* sign, const Policy& pol)
BOOST_MATH_GPU_ENABLED int CF1_jy(T v, T x, T* fv, int* sign, const Policy& pol)
{
T C, D, f, a, b, delta, tiny, tolerance;
unsigned long k;
@@ -185,7 +187,7 @@ namespace boost { namespace math {
// real values only.
//
template <typename T, typename Policy>
int CF2_jy(T v, T x, T* p, T* q, const Policy& pol)
BOOST_MATH_GPU_ENABLED int CF2_jy(T v, T x, T* p, T* q, const Policy& pol)
{
BOOST_MATH_STD_USING
@@ -254,13 +256,13 @@ namespace boost { namespace math {
return 0;
}
static const int need_j = 1;
static const int need_y = 2;
BOOST_MATH_STATIC const int need_j = 1;
BOOST_MATH_STATIC const int need_y = 2;
// Compute J(v, x) and Y(v, x) simultaneously by Steed's method, see
// Barnett et al, Computer Physics Communications, vol 8, 377 (1974)
template <typename T, typename Policy>
int bessel_jy(T v, T x, T* J, T* Y, int kind, const Policy& pol)
BOOST_MATH_GPU_ENABLED int bessel_jy(T v, T x, T* J, T* Y, int kind, const Policy& pol)
{
BOOST_MATH_ASSERT(x >= 0);
@@ -273,7 +275,7 @@ namespace boost { namespace math {
T cp = 0;
T sp = 0;
static const char* function = "boost::math::bessel_jy<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::bessel_jy<%1%>(%1%,%1%)";
BOOST_MATH_STD_USING
using namespace boost::math::tools;
@@ -284,7 +286,7 @@ namespace boost { namespace math {
reflect = true;
v = -v; // v is non-negative from here
}
if (v > static_cast<T>((std::numeric_limits<int>::max)()))
if (v > static_cast<T>((boost::math::numeric_limits<int>::max)()))
{
*J = *Y = policies::raise_evaluation_error<T>(function, "Order of Bessel function is too large to evaluate: got %1%", v, pol);
return 1; // LCOV_EXCL_LINE previous line will throw.
@@ -310,10 +312,10 @@ namespace boost { namespace math {
else if(kind & need_j)
*J = policies::raise_domain_error<T>(function, "Value of Bessel J_v(x) is complex-infinity at %1%", x, pol); // complex infinity
else
*J = std::numeric_limits<T>::quiet_NaN(); // LCOV_EXCL_LINE, we should never get here, any value will do, not using J.
*J = boost::math::numeric_limits<T>::quiet_NaN(); // LCOV_EXCL_LINE, we should never get here, any value will do, not using J.
if((kind & need_y) == 0)
*Y = std::numeric_limits<T>::quiet_NaN(); // any value will do, not using Y.
*Y = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do, not using Y.
else
{
// We should never get here:
@@ -333,7 +335,7 @@ namespace boost { namespace math {
// and divergent which leads to large errors :-(
//
Jv = bessel_j_small_z_series(v, x, pol);
Yv = std::numeric_limits<T>::quiet_NaN();
Yv = boost::math::numeric_limits<T>::quiet_NaN();
}
else if((x < 1) && (u != 0) && (log(policies::get_epsilon<T, Policy>() / 2) > v * log((x/2) * (x/2) / v)))
{
@@ -344,7 +346,7 @@ namespace boost { namespace math {
if(kind&need_j)
Jv = bessel_j_small_z_series(v, x, pol);
else
Jv = std::numeric_limits<T>::quiet_NaN();
Jv = boost::math::numeric_limits<T>::quiet_NaN();
if((org_kind&need_y && (!reflect || (cp != 0)))
|| (org_kind & need_j && (reflect && (sp != 0))))
{
@@ -352,7 +354,7 @@ namespace boost { namespace math {
Yv = bessel_y_small_z_series(v, x, &Yv_scale, pol);
}
else
Yv = std::numeric_limits<T>::quiet_NaN();
Yv = boost::math::numeric_limits<T>::quiet_NaN();
}
else if((u == 0) && (x < policies::get_epsilon<T, Policy>()))
{
@@ -363,7 +365,7 @@ namespace boost { namespace math {
if(kind&need_j)
Jv = bessel_j_small_z_series(v, x, pol);
else
Jv = std::numeric_limits<T>::quiet_NaN();
Jv = boost::math::numeric_limits<T>::quiet_NaN();
if((org_kind&need_y && (!reflect || (cp != 0)))
|| (org_kind & need_j && (reflect && (sp != 0))))
{
@@ -371,7 +373,7 @@ namespace boost { namespace math {
Yv = bessel_yn_small_z(n, x, &Yv_scale, pol);
}
else
Yv = std::numeric_limits<T>::quiet_NaN();
Yv = boost::math::numeric_limits<T>::quiet_NaN();
// LCOV_EXCL_STOP
}
else if(asymptotic_bessel_large_x_limit(v, x))
@@ -381,13 +383,13 @@ namespace boost { namespace math {
Yv = asymptotic_bessel_y_large_x_2(v, x, pol);
}
else
Yv = std::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
Yv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
if(kind&need_j)
{
Jv = asymptotic_bessel_j_large_x_2(v, x, pol);
}
else
Jv = std::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
Jv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
}
else if((x > 8) && hankel_PQ(v, x, &p, &q, pol))
{
@@ -449,7 +451,7 @@ namespace boost { namespace math {
Jv = scale * W / (Yv * fv - Yv1); // Wronskian relation
}
else
Jv = std::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
Jv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
Yv_scale = scale;
}
else // x in (2, \infty)
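
The "Wronskian relation" used above recovers Jv from Yv and the CF1 ratio fv = J_{v+1}/J_v via the standard identity:

$$
J_{\nu+1}(x)\,Y_\nu(x) - J_\nu(x)\,Y_{\nu+1}(x) = \frac{2}{\pi x}
\;\Longrightarrow\;
J_\nu(x) = \frac{2/(\pi x)}{f_\nu\,Y_\nu(x) - Y_{\nu+1}(x)}.
$$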
@@ -564,7 +566,7 @@ namespace boost { namespace math {
Yv = prev;
}
else
Yv = std::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
Yv = boost::math::numeric_limits<T>::quiet_NaN(); // any value will do, we're not using it.
}
if (reflect)

View File

@@ -16,12 +16,15 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/special_functions/factorials.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
namespace boost{ namespace math{ namespace detail{
template <class T>
inline T asymptotic_bessel_amplitude(T v, T x)
BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_amplitude(T v, T x)
{
// Calculate the amplitude of J(v, x) and Y(v, x) for large
// x: see A&S 9.2.28.
@@ -39,7 +42,7 @@ inline T asymptotic_bessel_amplitude(T v, T x)
}
template <class T>
T asymptotic_bessel_phase_mx(T v, T x)
BOOST_MATH_GPU_ENABLED T asymptotic_bessel_phase_mx(T v, T x)
{
//
// Calculate the phase of J(v, x) and Y(v, x) for large x.
@@ -63,7 +66,7 @@ T asymptotic_bessel_phase_mx(T v, T x)
}
template <class T, class Policy>
inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol)
{
// See A&S 9.2.19.
BOOST_MATH_STD_USING
@@ -93,7 +96,7 @@ inline T asymptotic_bessel_y_large_x_2(T v, T x, const Policy& pol)
}
template <class T, class Policy>
inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol)
{
// See A&S 9.2.19.
BOOST_MATH_STD_USING
@@ -124,7 +127,7 @@ inline T asymptotic_bessel_j_large_x_2(T v, T x, const Policy& pol)
}
template <class T>
inline bool asymptotic_bessel_large_x_limit(int v, const T& x)
BOOST_MATH_GPU_ENABLED inline bool asymptotic_bessel_large_x_limit(int v, const T& x)
{
BOOST_MATH_STD_USING
//
@@ -142,7 +145,7 @@ inline bool asymptotic_bessel_large_x_limit(int v, const T& x)
}
template <class T>
inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x)
BOOST_MATH_GPU_ENABLED inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x)
{
BOOST_MATH_STD_USING
//
@@ -155,11 +158,11 @@ inline bool asymptotic_bessel_large_x_limit(const T& v, const T& x)
// error rates either side of the divide for v < 10000.
// At double precision eps^1/8 ~= 0.01.
//
return (std::max)(T(fabs(v)), T(1)) < x * sqrt(tools::forth_root_epsilon<T>());
return BOOST_MATH_GPU_SAFE_MAX(T(fabs(v)), T(1)) < x * sqrt(tools::forth_root_epsilon<T>());
}
template <class T, class Policy>
void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol)
BOOST_MATH_GPU_ENABLED void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol)
{
T c = 1;
T p = (v / boost::math::sin_pi(v, pol)) * pow(x / 2, -v) / boost::math::tgamma(1 - v, pol);
@@ -193,7 +196,7 @@ void temme_asymptotic_y_small_x(T v, T x, T* Y, T* Y1, const Policy& pol)
}
template <class T, class Policy>
T asymptotic_bessel_i_large_x(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED T asymptotic_bessel_i_large_x(T v, T x, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names
T s = 1;
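
The sum started here should be the standard large-argument expansion for I (A&S 9.7.1), stated from the literature rather than from this hunk:

$$
I_\nu(x) \sim \frac{e^{x}}{\sqrt{2\pi x}} \left[ 1 - \frac{\mu-1}{8x} + \frac{(\mu-1)(\mu-9)}{2!\,(8x)^2} - \cdots \right],
\qquad \mu = 4\nu^2.
$$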

View File

@@ -10,10 +10,9 @@
#pragma once
#endif
#include <cmath>
#include <cstdint>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/tools/cstdint.hpp>
namespace boost { namespace math { namespace detail{
@@ -22,7 +21,7 @@ struct bessel_j_small_z_series_term
{
typedef T result_type;
bessel_j_small_z_series_term(T v_, T x)
BOOST_MATH_GPU_ENABLED bessel_j_small_z_series_term(T v_, T x)
: N(0), v(v_)
{
BOOST_MATH_STD_USING
@@ -30,7 +29,7 @@ struct bessel_j_small_z_series_term
mult *= -mult;
term = 1;
}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
T r = term;
++N;
@@ -49,7 +48,7 @@ private:
// Converges rapidly for all z << v.
//
template <class T, class Policy>
inline T bessel_j_small_z_series(T v, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T bessel_j_small_z_series(T v, T x, const Policy& pol)
{
BOOST_MATH_STD_USING
T prefix;
@@ -66,7 +65,7 @@ inline T bessel_j_small_z_series(T v, T x, const Policy& pol)
return prefix;
bessel_j_small_z_series_term<T, Policy> s(v, x);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon<T, Policy>(), max_iter);
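
The series summed here is the ascending series for J, with the (z/2)^v / Gamma(v+1) prefix split out above to dodge premature overflow:

$$
J_\nu(z) = \left(\frac{z}{2}\right)^{\!\nu} \sum_{k=0}^{\infty} \frac{(-z^2/4)^k}{k!\;\Gamma(\nu + k + 1)}.
$$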
@@ -79,7 +78,7 @@ struct bessel_y_small_z_series_term_a
{
typedef T result_type;
bessel_y_small_z_series_term_a(T v_, T x)
BOOST_MATH_GPU_ENABLED bessel_y_small_z_series_term_a(T v_, T x)
: N(0), v(v_)
{
BOOST_MATH_STD_USING
@@ -87,7 +86,7 @@ struct bessel_y_small_z_series_term_a
mult *= -mult;
term = 1;
}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
BOOST_MATH_STD_USING
T r = term;
@@ -107,7 +106,7 @@ struct bessel_y_small_z_series_term_b
{
typedef T result_type;
bessel_y_small_z_series_term_b(T v_, T x)
BOOST_MATH_GPU_ENABLED bessel_y_small_z_series_term_b(T v_, T x)
: N(0), v(v_)
{
BOOST_MATH_STD_USING
@@ -115,7 +114,7 @@ struct bessel_y_small_z_series_term_b
mult *= -mult;
term = 1;
}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
T r = term;
++N;
@@ -138,10 +137,10 @@ private:
// eps/2 * v^v * (x/2)^-v > (x/2)^v, i.e. log(eps/2) > v * log((x/2)^2/v)
//
template <class T, class Policy>
inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol)
{
BOOST_MATH_STD_USING
static const char* function = "bessel_y_small_z_series<%1%>(%1%,%1%)";
constexpr auto function = "bessel_y_small_z_series<%1%>(%1%,%1%)";
T prefix;
T gam;
T p = log(x / 2);
@@ -183,7 +182,7 @@ inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol)
prefix = -exp(prefix);
}
bessel_y_small_z_series_term_a<T, Policy> s(v, x);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
*pscale = scale;
T result = boost::math::tools::sum_series(s, boost::math::policies::get_epsilon<T, Policy>(), max_iter);
@@ -211,7 +210,7 @@ inline T bessel_y_small_z_series(T v, T x, T* pscale, const Policy& pol)
}
template <class T, class Policy>
T bessel_yn_small_z(int n, T z, T* scale, const Policy& pol)
BOOST_MATH_GPU_ENABLED T bessel_yn_small_z(int n, T z, T* scale, const Policy& pol)
{
//
// See http://functions.wolfram.com/Bessel-TypeFunctions/BesselY/06/01/04/01/02/

View File

@@ -18,19 +18,30 @@
#ifndef BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_
#define BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_
#include <algorithm>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/roots.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/cbrt.hpp>
#include <boost/math/special_functions/detail/airy_ai_bi_zero.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/special_functions/math_fwd.hpp>
#endif
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_suppress 20012
#endif
namespace boost { namespace math {
namespace detail
{
namespace bessel_zero
{
template<class T>
T equation_nist_10_21_19(const T& v, const T& a)
BOOST_MATH_GPU_ENABLED T equation_nist_10_21_19(const T& v, const T& a)
{
// Get the initial estimate of the m'th root of Jv or Yv.
// This subroutine is used for the order m with m > 1.
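
NIST 10.21.19 is McMahon's expansion; its leading terms (quoted from the DLMF from memory, so treat the higher coefficients as indicative):

$$
j_{\nu,m} \sim a - \frac{\mu-1}{8a} - \frac{4(\mu-1)(7\mu-31)}{3\,(8a)^3} - \cdots,
\qquad a = \Big(m + \frac{\nu}{2} - \frac{1}{4}\Big)\pi,\quad \mu = 4\nu^2,
$$

with a = (m + v/2 - 3/4)pi giving the corresponding estimate for the zeros of Yv.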
@@ -57,11 +68,11 @@
class equation_as_9_3_39_and_its_derivative
{
public:
explicit equation_as_9_3_39_and_its_derivative(const T& zt) : zeta(zt) { }
BOOST_MATH_GPU_ENABLED explicit equation_as_9_3_39_and_its_derivative(const T& zt) : zeta(zt) { }
equation_as_9_3_39_and_its_derivative(const equation_as_9_3_39_and_its_derivative&) = default;
BOOST_MATH_GPU_ENABLED equation_as_9_3_39_and_its_derivative(const equation_as_9_3_39_and_its_derivative&) = default;
boost::math::tuple<T, T> operator()(const T& z) const
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(const T& z) const
{
BOOST_MATH_STD_USING // ADL of std names, needed for acos, sqrt.
@@ -86,7 +97,7 @@
};
template<class T, class Policy>
static T equation_as_9_5_26(const T& v, const T& ai_bi_root, const Policy& pol)
BOOST_MATH_GPU_ENABLED T equation_as_9_5_26(const T& v, const T& ai_bi_root, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names, needed for log, sqrt.
@@ -132,9 +143,9 @@
// Select the maximum allowed iterations based on the number
// of decimal digits in the numeric type T, being at least 12.
const auto iterations_allowed = static_cast<std::uintmax_t>((std::max)(12, my_digits10 * 2));
const auto iterations_allowed = static_cast<boost::math::uintmax_t>(BOOST_MATH_GPU_SAFE_MAX(12, my_digits10 * 2));
std::uintmax_t iterations_used = iterations_allowed;
boost::math::uintmax_t iterations_used = iterations_allowed;
// Calculate the root of z as a function of zeta.
const T z = boost::math::tools::newton_raphson_iterate(
@@ -142,7 +153,7 @@
z_estimate,
range_zmin,
range_zmax,
(std::min)(boost::math::tools::digits<T>(), boost::math::tools::digits<float>()),
BOOST_MATH_GPU_SAFE_MIN(boost::math::tools::digits<T>(), boost::math::tools::digits<float>()),
iterations_used);
static_cast<void>(iterations_used);
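
newton_raphson_iterate takes a functor returning the pair (f, f'), a guess, a bracket, a target precision in bits, and an in-out iteration budget. A self-contained sketch of that calling pattern (the problem and names are illustrative, not from this patch):

#include <boost/math/tools/roots.hpp>
#include <cstdint>
#include <limits>
#include <utility>

// Solve x*x - a = 0 for x >= 0, mirroring how the zero-finding code above
// drives boost::math::tools::newton_raphson_iterate.
double newton_sqrt(double a)
{
   auto f = [a](double x) { return std::make_pair(x * x - a, 2 * x); };
   std::uintmax_t iterations = 24;  // budget; decremented in place by the call
   const int digits = std::numeric_limits<double>::digits / 2;  // Newton doubles digits per step
   return boost::math::tools::newton_raphson_iterate(f, a / 2 + 0.5, 0.0, a + 1, digits, iterations);
}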
@@ -168,7 +179,7 @@
namespace cyl_bessel_j_zero_detail
{
template<class T, class Policy>
T equation_nist_10_21_40_a(const T& v, const Policy& pol)
BOOST_MATH_GPU_ENABLED T equation_nist_10_21_40_a(const T& v, const Policy& pol)
{
const T v_pow_third(boost::math::cbrt(v, pol));
const T v_pow_minus_two_thirds(T(1) / (v_pow_third * v_pow_third));
@@ -185,13 +196,13 @@
class function_object_jv
{
public:
function_object_jv(const T& v,
BOOST_MATH_GPU_ENABLED function_object_jv(const T& v,
const Policy& pol) : my_v(v),
my_pol(pol) { }
function_object_jv(const function_object_jv&) = default;
BOOST_MATH_GPU_ENABLED function_object_jv(const function_object_jv&) = default;
T operator()(const T& x) const
BOOST_MATH_GPU_ENABLED T operator()(const T& x) const
{
return boost::math::cyl_bessel_j(my_v, x, my_pol);
}
@@ -206,15 +217,16 @@
class function_object_jv_and_jv_prime
{
public:
function_object_jv_and_jv_prime(const T& v,
const bool order_is_zero,
const Policy& pol) : my_v(v),
BOOST_MATH_GPU_ENABLED function_object_jv_and_jv_prime(
const T& v,
const bool order_is_zero,
const Policy& pol) : my_v(v),
my_order_is_zero(order_is_zero),
my_pol(pol) { }
function_object_jv_and_jv_prime(const function_object_jv_and_jv_prime&) = default;
boost::math::tuple<T, T> operator()(const T& x) const
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(const T& x) const
{
// Obtain Jv(x) and Jv'(x).
// Chris's original code called the Bessel function implementation layer directly,
@@ -246,10 +258,10 @@
const function_object_jv_and_jv_prime& operator=(const function_object_jv_and_jv_prime&) = delete;
};
template<class T> bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; }
template<class T> BOOST_MATH_GPU_ENABLED bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; }
template<class T, class Policy>
T initial_guess(const T& v, const int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED T initial_guess(const T& v, const int m, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names, needed for floor.
@@ -325,7 +337,7 @@
}
// Perform several steps of bisection iteration to refine the guess.
std::uintmax_t number_of_iterations(12U);
boost::math::uintmax_t number_of_iterations(12U);
// Do the bisection iteration.
const boost::math::tuple<T, T> guess_pair =
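
Because the tolerance functor above always returns false, bisect runs for exactly the given budget, i.e. it is used here as "halve the bracket N times". A sketch of that trick under the same assumptions (names illustrative):

#include <boost/math/tools/roots.hpp>
#include <cstdint>
#include <utility>

// Refine a bracketed root with a fixed number of bisection steps by passing
// a tolerance that never reports convergence.
template <class F, class T>
T refine_by_bisection(F f, T lo, T hi)
{
   std::uintmax_t iters = 12;  // the bracket is halved exactly this many times
   auto never_done = [](const T&, const T&) { return false; };
   const std::pair<T, T> bracket = boost::math::tools::bisect(f, lo, hi, never_done, iters);
   return (bracket.first + bracket.second) / 2;
}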
@@ -390,7 +402,7 @@
namespace cyl_neumann_zero_detail
{
template<class T, class Policy>
T equation_nist_10_21_40_b(const T& v, const Policy& pol)
BOOST_MATH_GPU_ENABLED T equation_nist_10_21_40_b(const T& v, const Policy& pol)
{
const T v_pow_third(boost::math::cbrt(v, pol));
const T v_pow_minus_two_thirds(T(1) / (v_pow_third * v_pow_third));
@@ -407,13 +419,13 @@
class function_object_yv
{
public:
function_object_yv(const T& v,
const Policy& pol) : my_v(v),
my_pol(pol) { }
BOOST_MATH_GPU_ENABLED function_object_yv(const T& v,
const Policy& pol) : my_v(v),
my_pol(pol) { }
function_object_yv(const function_object_yv&) = default;
BOOST_MATH_GPU_ENABLED function_object_yv(const function_object_yv&) = default;
T operator()(const T& x) const
BOOST_MATH_GPU_ENABLED T operator()(const T& x) const
{
return boost::math::cyl_neumann(my_v, x, my_pol);
}
@@ -428,13 +440,13 @@
class function_object_yv_and_yv_prime
{
public:
function_object_yv_and_yv_prime(const T& v,
const Policy& pol) : my_v(v),
my_pol(pol) { }
BOOST_MATH_GPU_ENABLED function_object_yv_and_yv_prime(const T& v,
const Policy& pol) : my_v(v),
my_pol(pol) { }
function_object_yv_and_yv_prime(const function_object_yv_and_yv_prime&) = default;
BOOST_MATH_GPU_ENABLED function_object_yv_and_yv_prime(const function_object_yv_and_yv_prime&) = default;
boost::math::tuple<T, T> operator()(const T& x) const
BOOST_MATH_GPU_ENABLED boost::math::tuple<T, T> operator()(const T& x) const
{
const T half_epsilon(boost::math::tools::epsilon<T>() / 2U);
@@ -469,10 +481,10 @@
const function_object_yv_and_yv_prime& operator=(const function_object_yv_and_yv_prime&) = delete;
};
template<class T> bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; }
template<class T> BOOST_MATH_GPU_ENABLED bool my_bisection_unreachable_tolerance(const T&, const T&) { return false; }
template<class T, class Policy>
T initial_guess(const T& v, const int m, const Policy& pol)
BOOST_MATH_GPU_ENABLED T initial_guess(const T& v, const int m, const Policy& pol)
{
BOOST_MATH_STD_USING // ADL of std names, needed for floor.
@@ -560,7 +572,7 @@
}
// Perform several steps of bisection iteration to refine the guess.
std::uintmax_t number_of_iterations(12U);
boost::math::uintmax_t number_of_iterations(12U);
// Do the bisection iteration.
const boost::math::tuple<T, T> guess_pair =
@@ -624,4 +636,8 @@
} // namespace bessel_zero
} } } // namespace boost::math::detail
#ifdef BOOST_MATH_ENABLE_CUDA
# pragma nv_diag_default 20012
#endif
#endif // BOOST_MATH_BESSEL_JY_ZERO_2013_01_18_HPP_

View File

@@ -13,10 +13,14 @@
#pragma warning(disable:4702) // Unreachable code (release mode only warning)
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/policies/error_handling.hpp>
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -44,35 +48,37 @@
namespace boost { namespace math { namespace detail{
template <typename T>
T bessel_k0(const T& x);
BOOST_MATH_GPU_ENABLED T bessel_k0(const T& x);
template <class T, class tag>
struct bessel_k0_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init(tag());
}
static void do_init(const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
{
bessel_k0(T(0.5));
bessel_k0(T(1.5));
}
static void do_init(const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
{
bessel_k0(T(0.5));
bessel_k0(T(1.5));
}
template <class U>
static void do_init(const U&){}
void force_instantiate()const{}
BOOST_MATH_GPU_ENABLED static void do_init(const U&){}
BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_STATIC const init initializer;
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
@@ -81,14 +87,14 @@ const typename bessel_k0_initializer<T, tag>::init bessel_k0_initializer<T, tag>
template <typename T, int N>
T bessel_k0_imp(const T&, const std::integral_constant<int, N>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T&, const boost::math::integral_constant<int, N>&)
{
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
T bessel_k0_imp(const T& x, const std::integral_constant<int, 24>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant<int, 24>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -97,14 +103,14 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 24>&)
// Expected Error Term : -2.358e-09
// Maximum Relative Change in Control Points : 9.552e-02
// Max Error found at float precision = Poly : 4.448220e-08
static const T Y = 1.137250900268554688f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.137250900268554688f;
BOOST_MATH_STATIC const T P[] =
{
-1.372508979104259711e-01f,
2.622545986273687617e-01f,
5.047103728247919836e-03f
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000000000000e+00f,
-8.928694018000029415e-02f,
@@ -117,7 +123,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 24>&)
// Expected Error Term : -1.343e-09
// Maximum Relative Change in Control Points : 2.405e-02
// Max Error found at float precision = Poly : 1.354814e-07
static const T P2[] = {
BOOST_MATH_STATIC const T P2[] = {
1.159315158e-01f,
2.789828686e-01f,
2.524902861e-02f,
@@ -133,14 +139,14 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 24>&)
// Maximum Relative Change in Control Points : 9.064e-02
// Max Error found at float precision = Poly : 5.065020e-08
static const T P[] =
BOOST_MATH_STATIC const T P[] =
{
2.533141220e-01f,
5.221502603e-01f,
6.380180669e-02f,
-5.934976547e-02f
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000e+00f,
2.679722431e+00f,
@@ -158,7 +164,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 24>&)
}
template <typename T>
T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant<int, 53>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -167,8 +173,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
// Expected Error Term : -6.077e-17
// Maximum Relative Change in Control Points : 7.797e-02
// Max Error found at double precision = Poly : 1.003156e-16
static const T Y = 1.137250900268554688;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.137250900268554688;
BOOST_MATH_STATIC const T P[] =
{
-1.372509002685546267e-01,
2.574916117833312855e-01,
@@ -176,7 +182,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
5.445476986653926759e-04,
7.125159422136622118e-06
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000000000000e+00,
-5.458333438017788530e-02,
@@ -191,7 +197,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
// Expected Error Term : 3.392e-18
// Maximum Relative Change in Control Points : 2.041e-02
// Max Error found at double precision = Poly : 2.513112e-16
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
1.159315156584124484e-01,
2.789828789146031732e-01,
@@ -212,8 +218,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
// Maximum Relative Change in Control Points : 2.757e-01
// Max Error found at double precision = Poly : 1.001560e-16
static const T Y = 1;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1;
BOOST_MATH_STATIC const T P[] =
{
2.533141373155002416e-01,
3.628342133984595192e+00,
@@ -225,7 +231,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
-1.414237994269995877e+00,
-9.369168119754924625e-02
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000000000000e+00,
1.494194694879908328e+01,
@@ -248,7 +254,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 53>&)
}
template <typename T>
T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant<int, 64>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -257,8 +263,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
// Expected Error Term : 2.180e-22
// Maximum Relative Change in Control Points : 2.943e-01
// Max Error found at float80 precision = Poly : 3.923207e-20
static const T Y = 1.137250900268554687500e+00;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.137250900268554687500e+00;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, -1.372509002685546875002e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.566481981037407600436e-01),
@@ -267,7 +273,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_BIG_CONSTANT(T, 64, 1.213747930378196492543e-05),
BOOST_MATH_BIG_CONSTANT(T, 64, 9.423709328020389560844e-08)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.843828412587773008342e-02),
@@ -284,7 +290,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
// Expected Error Term : -2.434e-21
// Maximum Relative Change in Control Points : 2.459e-02
// Max Error found at float80 precision = Poly : 1.482487e-19
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.159315156584124488110e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.764832791416047889734e-01),
@@ -292,7 +298,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_BIG_CONSTANT(T, 64, 3.660777862036966089410e-04),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.094942446930673386849e-06)
};
static const T Q2[] =
BOOST_MATH_STATIC const T Q2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, -2.156100313881251616320e-02),
@@ -308,8 +314,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
// Expected Error Term : 2.236e-21
// Maximum Relative Change in Control Points : 3.021e-01
//Max Error found at float80 precision = Poly : 8.727378e-20
static const T Y = 1;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 2.533141373155002512056e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, 5.417942070721928652715e+00),
@@ -323,7 +329,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_BIG_CONSTANT(T, 64, -4.059789241612946683713e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.612783121537333908889e-01)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.200669254769325861404e+01),
@@ -348,7 +354,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 64>&)
}
template <typename T>
T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant<int, 113>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -357,8 +363,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
// Expected Error Term : 5.682e-37
// Maximum Relative Change in Control Points : 6.094e-04
// Max Error found at float128 precision = Poly : 5.338213e-35
static const T Y = 1.137250900268554687500000000000000000e+00f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.137250900268554687500000000000000000e+00f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, -1.372509002685546875000000000000000006e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.556212905071072782462974351698081303e-01),
@@ -369,7 +375,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_BIG_CONSTANT(T, 113, 1.752489221949580551692915881999762125e-09),
BOOST_MATH_BIG_CONSTANT(T, 113, 5.243010555737173524710512824955368526e-12)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, -4.095631064064621099785696980653193721e-02),
@@ -387,7 +393,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
// Expected Error Term : 5.105e-38
// Maximum Relative Change in Control Points : 9.734e-03
// Max Error found at float128 precision = Poly : 1.688806e-34
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.159315156584124488107200313757741370e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.789828789146031122026800078439435369e-01),
@@ -413,8 +419,8 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
// Expected Error Term : 4.917e-40
// Maximum Relative Change in Control Points : 3.385e-01
// Max Error found at float128 precision = Poly : 1.567573e-34
static const T Y = 1;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 2.533141373155002512078826424055226265e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.001949740768235770078339977110749204e+01),
@@ -439,7 +445,7 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_BIG_CONSTANT(T, 113, -4.201632288615609937883545928660649813e+03),
BOOST_MATH_BIG_CONSTANT(T, 113, -3.690820607338480548346746717311811406e+01)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, 7.964877874035741452203497983642653107e+01),
@@ -475,33 +481,33 @@ T bessel_k0_imp(const T& x, const std::integral_constant<int, 113>&)
}
template <typename T>
T bessel_k0_imp(const T& x, const std::integral_constant<int, 0>&)
BOOST_MATH_GPU_ENABLED T bessel_k0_imp(const T& x, const boost::math::integral_constant<int, 0>&)
{
if(boost::math::tools::digits<T>() <= 24)
return bessel_k0_imp(x, std::integral_constant<int, 24>());
return bessel_k0_imp(x, boost::math::integral_constant<int, 24>());
else if(boost::math::tools::digits<T>() <= 53)
return bessel_k0_imp(x, std::integral_constant<int, 53>());
return bessel_k0_imp(x, boost::math::integral_constant<int, 53>());
else if(boost::math::tools::digits<T>() <= 64)
return bessel_k0_imp(x, std::integral_constant<int, 64>());
return bessel_k0_imp(x, boost::math::integral_constant<int, 64>());
else if(boost::math::tools::digits<T>() <= 113)
return bessel_k0_imp(x, std::integral_constant<int, 113>());
return bessel_k0_imp(x, boost::math::integral_constant<int, 113>());
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
inline T bessel_k0(const T& x)
BOOST_MATH_GPU_ENABLED inline T bessel_k0(const T& x)
{
typedef std::integral_constant<int,
((std::numeric_limits<T>::digits == 0) || (std::numeric_limits<T>::radix != 2)) ?
typedef boost::math::integral_constant<int,
((boost::math::numeric_limits<T>::digits == 0) || (boost::math::numeric_limits<T>::radix != 2)) ?
0 :
std::numeric_limits<T>::digits <= 24 ?
boost::math::numeric_limits<T>::digits <= 24 ?
24 :
std::numeric_limits<T>::digits <= 53 ?
boost::math::numeric_limits<T>::digits <= 53 ?
53 :
std::numeric_limits<T>::digits <= 64 ?
boost::math::numeric_limits<T>::digits <= 64 ?
64 :
std::numeric_limits<T>::digits <= 113 ?
boost::math::numeric_limits<T>::digits <= 113 ?
113 : -1
> tag_type;
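
The tag computed here routes overload resolution to the coefficient table sized for T's mantissa; a host-side mirror of that selection logic (hypothetical helper, using std::integral_constant for brevity):

#include <type_traits>

// Map a mantissa width in bits to the precision tag the _imp overloads key on.
template <int Digits>
using k0_tag_for = std::integral_constant<int,
    Digits <= 24 ? 24 :
    Digits <= 53 ? 53 :
    Digits <= 64 ? 64 :
    Digits <= 113 ? 113 : -1>;

static_assert(k0_tag_for<24>::value == 24, "float selects the 24-bit table");
static_assert(k0_tag_for<53>::value == 53, "double selects the 53-bit table");
static_assert(k0_tag_for<64>::value == 64, "80-bit long double selects the 64-bit table");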

View File

@@ -13,6 +13,10 @@
#pragma warning(disable:4702) // Unreachable code (release mode only warning)
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/rational.hpp>
#include <boost/math/tools/big_constant.hpp>
#include <boost/math/policies/error_handling.hpp>
@@ -44,36 +48,38 @@
namespace boost { namespace math { namespace detail{
template <typename T>
T bessel_k1(const T&);
BOOST_MATH_GPU_ENABLED T bessel_k1(const T&);
template <class T, class tag>
struct bessel_k1_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init(tag());
}
static void do_init(const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
{
bessel_k1(T(0.5));
bessel_k1(T(2));
bessel_k1(T(6));
}
static void do_init(const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
{
bessel_k1(T(0.5));
bessel_k1(T(6));
}
template <class U>
static void do_init(const U&) {}
void force_instantiate()const {}
BOOST_MATH_GPU_ENABLED static void do_init(const U&) {}
BOOST_MATH_GPU_ENABLED void force_instantiate()const {}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_STATIC const init initializer;
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
@@ -82,14 +88,14 @@ namespace boost { namespace math { namespace detail{
template <typename T, int N>
inline T bessel_k1_imp(const T&, const std::integral_constant<int, N>&)
inline BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T&, const boost::math::integral_constant<int, N>&)
{
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
T bessel_k1_imp(const T& x, const std::integral_constant<int, 24>&)
BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant<int, 24>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -98,14 +104,14 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -3.053e-12
// Maximum Relative Change in Control Points : 4.927e-02
// Max Error found at float precision = Poly : 7.918347e-10
static const T Y = 8.695471287e-02f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 8.695471287e-02f;
BOOST_MATH_STATIC const T P[] =
{
-3.621379531e-03f,
7.131781976e-03f,
-1.535278300e-05f
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000e+00f,
-5.173102701e-02f,
@@ -118,7 +124,7 @@ namespace boost { namespace math { namespace detail{
// Maximum Deviation Found: 3.556e-08
// Expected Error Term : -3.541e-08
// Maximum Relative Change in Control Points : 8.203e-02
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
-3.079657469e-01f,
-8.537108913e-02f,
@@ -134,15 +140,15 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -3.227e-08
// Maximum Relative Change in Control Points : 9.917e-02
// Max Error found at float precision = Poly : 6.084411e-08
static const T Y = 1.450342178f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.450342178f;
BOOST_MATH_STATIC const T P[] =
{
-1.970280088e-01f,
2.188747807e-02f,
7.270394756e-01f,
2.490678196e-01f
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.000000000e+00f,
2.274292882e+00f,
@@ -160,7 +166,7 @@ namespace boost { namespace math { namespace detail{
}
template <typename T>
T bessel_k1_imp(const T& x, const std::integral_constant<int, 53>&)
BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant<int, 53>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -169,15 +175,15 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : 1.921e-17
// Maximum Relative Change in Control Points : 5.287e-03
// Max Error found at double precision = Poly : 2.004747e-17
static const T Y = 8.69547128677368164e-02f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 8.69547128677368164e-02f;
BOOST_MATH_STATIC const T P[] =
{
-3.62137953440350228e-03,
7.11842087490330300e-03,
1.00302560256614306e-05,
1.77231085381040811e-06
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.00000000000000000e+00,
-4.80414794429043831e-02,
@@ -193,14 +199,14 @@ namespace boost { namespace math { namespace detail{
// Maximum Relative Change in Control Points : 3.103e-04
// Max Error found at double precision = Poly : 1.246698e-16
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
-3.07965757829206184e-01,
-7.80929703673074907e-02,
-2.70619343754051620e-03,
-2.49549522229072008e-05
};
static const T Q2[] =
BOOST_MATH_STATIC const T Q2[] =
{
1.00000000000000000e+00,
-2.36316836412163098e-02,
@@ -217,8 +223,8 @@ namespace boost { namespace math { namespace detail{
// Maximum Relative Change in Control Points : 2.786e-01
// Max Error found at double precision = Poly : 1.258798e-16
static const T Y = 1.45034217834472656f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.45034217834472656f;
BOOST_MATH_STATIC const T P[] =
{
-1.97028041029226295e-01,
-2.32408961548087617e+00,
@@ -230,7 +236,7 @@ namespace boost { namespace math { namespace detail{
6.62582288933739787e+00,
3.08851840645286691e-01
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
1.00000000000000000e+00,
1.41811409298826118e+01,
@@ -253,7 +259,7 @@ namespace boost { namespace math { namespace detail{
}
template <typename T>
T bessel_k1_imp(const T& x, const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant<int, 64>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -262,8 +268,8 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -5.548e-23
// Maximum Relative Change in Control Points : 2.002e-03
// Max Error found at float80 precision = Poly : 9.352785e-22
static const T Y = 8.695471286773681640625e-02f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 8.695471286773681640625e-02f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, -3.621379534403483072861e-03),
BOOST_MATH_BIG_CONSTANT(T, 64, 7.102135866103952705932e-03),
@@ -271,7 +277,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 64, 2.537484002571894870830e-06),
BOOST_MATH_BIG_CONSTANT(T, 64, 6.603228256820000135990e-09)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.354457194045068370363e-02),
@@ -287,7 +293,7 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : 1.995e-23
// Maximum Relative Change in Control Points : 8.174e-04
// Max Error found at float80 precision = Poly : 4.137325e-20
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, -3.079657578292062244054e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -7.963049154965966503231e-02),
@@ -295,7 +301,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 64, -4.023052834702215699504e-05),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.719459155018493821839e-07)
};
static const T Q2[] =
BOOST_MATH_STATIC const T Q2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, -1.863917670410152669768e-02),
@@ -312,8 +318,8 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -3.302e-21
// Maximum Relative Change in Control Points : 3.432e-01
// Max Error found at float80 precision = Poly : 1.083755e-19
static const T Y = 1.450342178344726562500e+00f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.450342178344726562500e+00f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, -1.970280410292263112917e-01),
BOOST_MATH_BIG_CONSTANT(T, 64, -4.058564803062959169322e+00),
@@ -328,7 +334,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 64, 4.319614662598089438939e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, 3.710715864316521856193e-02)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 64, 1.000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 64, 2.298433045824439052398e+01),
@@ -353,7 +359,7 @@ namespace boost { namespace math { namespace detail{
}
template <typename T>
T bessel_k1_imp(const T& x, const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant<int, 113>&)
{
BOOST_MATH_STD_USING
if(x <= 1)
@@ -362,8 +368,8 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -7.119e-35
// Maximum Relative Change in Control Points : 1.207e-03
// Max Error found at float128 precision = Poly : 7.143688e-35
static const T Y = 8.695471286773681640625000000000000000e-02f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 8.695471286773681640625000000000000000e-02f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, -3.621379534403483072916666666666595475e-03),
BOOST_MATH_BIG_CONSTANT(T, 113, 7.074117676930975433219826471336547627e-03),
@@ -373,7 +379,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 113, 2.347140307321161346703214099534250263e-10),
BOOST_MATH_BIG_CONSTANT(T, 113, 5.569608494081482873946791086435679661e-13)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, -3.580768910152105375615558920428350204e-02),
@@ -391,7 +397,7 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : 4.473e-37
// Maximum Relative Change in Control Points : 8.550e-04
// Max Error found at float128 precision = Poly : 8.167701e-35
static const T P2[] =
BOOST_MATH_STATIC const T P2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, -3.079657578292062244053600156878870690e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -8.133183745732467770755578848987414875e-02),
@@ -401,7 +407,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 113, -1.632502325880313239698965376754406011e-09),
BOOST_MATH_BIG_CONSTANT(T, 113, -2.311973065898784812266544485665624227e-12)
};
static const T Q2[] =
BOOST_MATH_STATIC const T Q2[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, -1.311471216733781016657962995723287450e-02),
@@ -418,8 +424,8 @@ namespace boost { namespace math { namespace detail{
{
// Max error in interpolated form: 5.307e-37
// Max Error found at float128 precision = Poly: 7.087862e-35
static const T Y = 1.5023040771484375f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.5023040771484375f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, -2.489899398329369710528254347931380044e-01),
BOOST_MATH_BIG_CONSTANT(T, 113, -6.819080211203854781858815596508456873e+00),
@@ -438,7 +444,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.039705646510167437971862966128055524e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, 1.008418100718254816100425022904039530e-02)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.927456835239137986889227412815459529e+01),
@@ -465,8 +471,8 @@ namespace boost { namespace math { namespace detail{
// Expected Error Term : -6.565e-40
// Maximum Relative Change in Control Points : 1.880e-01
// Max Error found at float128 precision = Poly : 2.943572e-35
static const T Y = 1.308816909790039062500000000000000000f;
static const T P[] =
BOOST_MATH_STATIC const T Y = 1.308816909790039062500000000000000000f;
BOOST_MATH_STATIC const T P[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, -5.550277247453881129211735759447737350e-02),
BOOST_MATH_BIG_CONSTANT(T, 113, -3.485883080219574328217554864956175929e+00),
@@ -486,7 +492,7 @@ namespace boost { namespace math { namespace detail{
BOOST_MATH_BIG_CONSTANT(T, 113, 8.981057433937398731355768088809437625e+05),
BOOST_MATH_BIG_CONSTANT(T, 113, 2.519440069856232098711793483639792952e+04)
};
static const T Q[] =
BOOST_MATH_STATIC const T Q[] =
{
BOOST_MATH_BIG_CONSTANT(T, 113, 1.000000000000000000000000000000000000e+00),
BOOST_MATH_BIG_CONSTANT(T, 113, 7.127348248283623146544565916604103560e+01),
@@ -517,33 +523,33 @@ namespace boost { namespace math { namespace detail{
}
template <typename T>
T bessel_k1_imp(const T& x, const std::integral_constant<int, 0>&)
BOOST_MATH_GPU_ENABLED T bessel_k1_imp(const T& x, const boost::math::integral_constant<int, 0>&)
{
if(boost::math::tools::digits<T>() <= 24)
return bessel_k1_imp(x, std::integral_constant<int, 24>());
return bessel_k1_imp(x, boost::math::integral_constant<int, 24>());
else if(boost::math::tools::digits<T>() <= 53)
return bessel_k1_imp(x, std::integral_constant<int, 53>());
return bessel_k1_imp(x, boost::math::integral_constant<int, 53>());
else if(boost::math::tools::digits<T>() <= 64)
return bessel_k1_imp(x, std::integral_constant<int, 64>());
return bessel_k1_imp(x, boost::math::integral_constant<int, 64>());
else if(boost::math::tools::digits<T>() <= 113)
return bessel_k1_imp(x, std::integral_constant<int, 113>());
return bessel_k1_imp(x, boost::math::integral_constant<int, 113>());
BOOST_MATH_ASSERT(0);
return 0;
}
template <typename T>
inline T bessel_k1(const T& x)
template <typename T>
inline BOOST_MATH_GPU_ENABLED T bessel_k1(const T& x)
{
typedef std::integral_constant<int,
((std::numeric_limits<T>::digits == 0) || (std::numeric_limits<T>::radix != 2)) ?
typedef boost::math::integral_constant<int,
((boost::math::numeric_limits<T>::digits == 0) || (boost::math::numeric_limits<T>::radix != 2)) ?
0 :
std::numeric_limits<T>::digits <= 24 ?
boost::math::numeric_limits<T>::digits <= 24 ?
24 :
std::numeric_limits<T>::digits <= 53 ?
boost::math::numeric_limits<T>::digits <= 53 ?
53 :
std::numeric_limits<T>::digits <= 64 ?
boost::math::numeric_limits<T>::digits <= 64 ?
64 :
std::numeric_limits<T>::digits <= 113 ?
boost::math::numeric_limits<T>::digits <= 113 ?
113 : -1
> tag_type;
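The tag_type selection above resolves entirely at compile time, so no runtime branch survives into device code. A minimal standalone sketch of the same digits-driven dispatch, with placeholder fits and std:: names (the commit swaps those for the boost::math shims so NVRTC can digest them):

#include <cstdio>
#include <limits>
#include <type_traits>

template <typename T>
T k1_imp(T x, const std::integral_constant<int, 24>&) { return x; }     // float-width fit (placeholder)

template <typename T>
T k1_imp(T x, const std::integral_constant<int, 53>&) { return 2 * x; } // double-width fit (placeholder)

template <typename T>
T k1_sketch(T x)
{
   using tag = std::integral_constant<int,
      std::numeric_limits<T>::digits <= 24 ? 24 : 53>;
   return k1_imp(x, tag());  // overload picked by the compiler, not at runtime
}

int main() { std::printf("%g %g\n", k1_sketch(1.0f), k1_sketch(1.0)); } // prints: 1 2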

View File

@@ -10,8 +10,12 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/detail/bessel_k0.hpp>
#include <boost/math/special_functions/detail/bessel_k1.hpp>
#include <boost/math/special_functions/sign.hpp>
#include <boost/math/policies/error_handling.hpp>
// Modified Bessel function of the second kind of integer order
@@ -20,14 +24,14 @@
namespace boost { namespace math { namespace detail{
template <typename T, typename Policy>
T bessel_kn(int n, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED T bessel_kn(int n, T x, const Policy& pol)
{
BOOST_MATH_STD_USING
T value, current, prev;
using namespace boost::math::tools;
static const char* function = "boost::math::bessel_kn<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::bessel_kn<%1%>(%1%,%1%)";
if (x < 0)
{
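bessel_kn itself is a forward recurrence over the two fixed-order kernels; K grows with order, so forward stepping is the numerically stable direction. A standalone sketch with the seeds passed in (plain double; Boost seeds from the bessel_k0/bessel_k1 fits above and adds overflow policing):

#include <cmath>
#include <cstdio>

double kn_sketch(int n, double x, double k0, double k1)
{
   double prev = k0, current = k1;
   for (int k = 1; k < n; ++k)
   {
      double next = 2 * k / x * current + prev;  // K_{k+1}(x) = (2k/x) K_k(x) + K_{k-1}(x)
      prev = current;
      current = next;
   }
   return n == 0 ? k0 : current;
}

int main()
{
   // K0(2) ~= 0.11389, K1(2) ~= 0.13987; expect K2(2) ~= 0.25376
   std::printf("%.5f\n", kn_sketch(2, 2.0, 0.11389, 0.13987));
}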

View File

@@ -12,6 +12,7 @@
#pragma warning(disable:4702) // Unreachable code (release mode only warning)
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/detail/bessel_j0.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/tools/rational.hpp>
@@ -36,12 +37,12 @@
namespace boost { namespace math { namespace detail{
template <typename T, typename Policy>
T bessel_y0(T x, const Policy&);
BOOST_MATH_GPU_ENABLED T bessel_y0(T x, const Policy&);
template <typename T, typename Policy>
T bessel_y0(T x, const Policy&)
BOOST_MATH_GPU_ENABLED T bessel_y0(T x, const Policy&)
{
static const T P1[] = {
BOOST_MATH_STATIC const T P1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0723538782003176831e+11)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.3716255451260504098e+09)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.0422274357376619816e+08)),
@@ -49,7 +50,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0102532948020907590e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8402381979244993524e+01)),
};
static const T Q1[] = {
BOOST_MATH_STATIC const T Q1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.8873865738997033405e+11)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.1617187777290363573e+09)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.5662956624278251596e+07)),
@@ -57,7 +58,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6475986689240190091e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T P2[] = {
BOOST_MATH_STATIC const T P2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -2.2213976967566192242e+13)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -5.5107435206722644429e+11)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.3600098638603061642e+10)),
@@ -66,7 +67,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4566865832663635920e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.7427031242901594547e+01)),
};
static const T Q2[] = {
BOOST_MATH_STATIC const T Q2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.3386146580707264428e+14)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4266824419412347550e+12)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.4015103849971240096e+10)),
@@ -75,7 +76,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.3030857612070288823e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T P3[] = {
BOOST_MATH_STATIC const T P3[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.0728726905150210443e+15)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.7016641869173237784e+14)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2829912364088687306e+11)),
@@ -85,7 +86,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1363534169313901632e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.7439661319197499338e+01)),
};
static const T Q3[] = {
BOOST_MATH_STATIC const T Q3[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.4563724628846457519e+17)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.9272425569640309819e+15)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2598377924042897629e+13)),
@@ -95,7 +96,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.7903362168128450017e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T PC[] = {
BOOST_MATH_STATIC const T PC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684302e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1345386639580765797e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1170523380864944322e+04)),
@@ -103,7 +104,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5376201909008354296e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.8961548424210455236e-01)),
};
static const T QC[] = {
BOOST_MATH_STATIC const T QC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2779090197304684318e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1370412495510416640e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1215350561880115730e+04)),
@@ -111,7 +112,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.5711159858080893649e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T PS[] = {
BOOST_MATH_STATIC const T PS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.9226600200800094098e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.8591953644342993800e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.1183429920482737611e+02)),
@@ -119,7 +120,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2441026745835638459e+00)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -8.8033303048680751817e-03)),
};
static const T QS[] = {
BOOST_MATH_STATIC const T QS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.7105024128512061905e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1951131543434613647e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.2642780169211018836e+03)),
@@ -127,7 +128,7 @@ T bessel_y0(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 9.0593769594993125859e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.9357696627916752158e-01)),
BOOST_MATH_STATIC const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.9357696627916752158e-01)),
x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.9576784193148578684e+00)),
x3 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0860510603017726976e+00)),
x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.280e+02)),
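The P1/Q1 through PS/QS tables are consumed by tools::evaluate_rational as R(z) = P(z)/Q(z) over matched intervals of x. A standalone sketch of that evaluation in Horner form (fixed double precision, illustrative coefficients, not values from the tables):

#include <cstdio>
#include <cstddef>

template <std::size_t N>
double eval_rational(const double (&p)[N], const double (&q)[N], double z)
{
   double num = p[N - 1], den = q[N - 1];
   for (std::size_t i = N - 1; i-- > 0;)
   {
      num = num * z + p[i];   // Horner step; coefficients stored lowest degree first
      den = den * z + q[i];
   }
   return num / den;
}

int main()
{
   const double p[] = {1.0, 2.0, 3.0};
   const double q[] = {1.0, 0.5, 0.25};
   std::printf("%g\n", eval_rational(p, q, 0.5));  // (1 + 2z + 3z^2) / (1 + z/2 + z^2/4)
}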

View File

@@ -12,6 +12,7 @@
#pragma warning(disable:4702) // Unreachable code (release mode only warning)
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/detail/bessel_j1.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/math/tools/rational.hpp>
@@ -36,12 +37,12 @@
namespace boost { namespace math { namespace detail{
template <typename T, typename Policy>
T bessel_y1(T x, const Policy&);
BOOST_MATH_GPU_ENABLED T bessel_y1(T x, const Policy&);
template <typename T, typename Policy>
T bessel_y1(T x, const Policy&)
BOOST_MATH_GPU_ENABLED T bessel_y1(T x, const Policy&)
{
static const T P1[] = {
BOOST_MATH_STATIC const T P1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.0535726612579544093e+13)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4708611716525426053e+12)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.7595974497819597599e+11)),
@@ -50,7 +51,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.2157953222280260820e+05)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -3.1714424660046133456e+02)),
};
static const T Q1[] = {
BOOST_MATH_STATIC const T Q1[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.0737873921079286084e+14)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 4.1272286200406461981e+12)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.7800352738690585613e+10)),
@@ -59,7 +60,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.2079908168393867438e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T P2[] = {
BOOST_MATH_STATIC const T P2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.1514276357909013326e+19)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -5.6808094574724204577e+18)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -2.3638408497043134724e+16)),
@@ -70,7 +71,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.9153806858264202986e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.2337180442012953128e+03)),
};
static const T Q2[] = {
BOOST_MATH_STATIC const T Q2[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.3321844313316185697e+20)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.6968198822857178911e+18)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.0837179548112881950e+16)),
@@ -81,7 +82,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.2855164849321609336e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T PC[] = {
BOOST_MATH_STATIC const T PC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278571e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9422465050776411957e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.6033732483649391093e+06)),
@@ -90,7 +91,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.6116166443246101165e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)),
};
static const T QC[] = {
BOOST_MATH_STATIC const T QC[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -4.4357578167941278568e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -9.9341243899345856590e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -6.5853394797230870728e+06)),
@@ -99,7 +100,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, -1.4550094401904961825e+03)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T PS[] = {
BOOST_MATH_STATIC const T PS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.3220913409857223519e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.5145160675335701966e+04)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 6.6178836581270835179e+04)),
@@ -108,7 +109,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 3.5265133846636032186e+01)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 0.0)),
};
static const T QS[] = {
BOOST_MATH_STATIC const T QS[] = {
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 7.0871281941028743574e+05)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8194580422439972989e+06)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.4194606696037208929e+06)),
@@ -117,7 +118,7 @@ T bessel_y1(T x, const Policy&)
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 8.6383677696049909675e+02)),
static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.0)),
};
static const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1971413260310170351e+00)),
BOOST_MATH_STATIC const T x1 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 2.1971413260310170351e+00)),
x2 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.4296810407941351328e+00)),
x11 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 5.620e+02)),
x12 = static_cast<T>(BOOST_MATH_BIG_CONSTANT(T, 64, 1.8288260310170351490e-03)),

View File

@@ -10,9 +10,11 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/detail/bessel_y0.hpp>
#include <boost/math/special_functions/detail/bessel_y1.hpp>
#include <boost/math/special_functions/detail/bessel_jy_series.hpp>
#include <boost/math/special_functions/sign.hpp>
#include <boost/math/policies/error_handling.hpp>
// Bessel function of the second kind of integer order
@@ -21,14 +23,14 @@
namespace boost { namespace math { namespace detail{
template <typename T, typename Policy>
T bessel_yn(int n, T x, const Policy& pol)
BOOST_MATH_GPU_ENABLED T bessel_yn(int n, T x, const Policy& pol)
{
BOOST_MATH_STD_USING
T value, factor, current, prev;
using namespace boost::math::tools;
static const char* function = "boost::math::bessel_yn<%1%>(%1%,%1%)";
constexpr auto function = "boost::math::bessel_yn<%1%>(%1%,%1%)";
if ((x == 0) && (n == 0))
{

View File

@@ -10,28 +10,29 @@
#pragma once
#endif
#include <type_traits>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/special_functions/round.hpp>
namespace boost { namespace math { namespace detail{
template <class T, class Policy>
inline int iconv_imp(T v, Policy const&, std::true_type const&)
BOOST_MATH_GPU_ENABLED inline int iconv_imp(T v, Policy const&, boost::math::true_type const&)
{
return static_cast<int>(v);
}
template <class T, class Policy>
inline int iconv_imp(T v, Policy const& pol, std::false_type const&)
BOOST_MATH_GPU_ENABLED inline int iconv_imp(T v, Policy const& pol, boost::math::false_type const&)
{
BOOST_MATH_STD_USING
return iround(v, pol);
}
template <class T, class Policy>
inline int iconv(T v, Policy const& pol)
BOOST_MATH_GPU_ENABLED inline int iconv(T v, Policy const& pol)
{
typedef typename std::is_convertible<T, int>::type tag_type;
typedef typename boost::math::is_convertible<T, int>::type tag_type;
return iconv_imp(v, pol, tag_type());
}
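The dispatch is worth spelling out: anything convertible to int (all built-in arithmetic types, double included) takes the straight truncating cast, while the iround path exists for multiprecision types with no implicit conversion. A standalone sketch with std:: names (the commit's version adds BOOST_MATH_GPU_ENABLED and the boost::math trait shims):

#include <cmath>
#include <cstdio>
#include <type_traits>

template <class T>
int iconv_sketch_imp(T v, std::true_type)  { return static_cast<int>(v); }              // truncating cast

template <class T>
int iconv_sketch_imp(T v, std::false_type) { return static_cast<int>(std::lround(v)); } // round instead

template <class T>
int iconv_sketch(T v)
{
   return iconv_sketch_imp(v, typename std::is_convertible<T, int>::type());
}

int main() { std::printf("%d %d\n", iconv_sketch(7), iconv_sketch(7.9)); } // both convertible: 7 7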

View File

@@ -10,19 +10,23 @@
#pragma once
#endif
#ifdef _MSC_VER
#pragma warning(push) // Temporary until lexical cast fixed.
#pragma warning(disable: 4127 4701)
#endif
#include <boost/math/tools/convert_from_string.hpp>
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#include <cmath>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/array.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/cxx03_warn.hpp>
#include <array>
#include <type_traits>
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
# ifdef _MSC_VER
# pragma warning(push) // Temporary until lexical cast fixed.
# pragma warning(disable: 4127 4701)
# endif
# include <boost/math/tools/convert_from_string.hpp>
# ifdef _MSC_VER
# pragma warning(pop)
# endif
#endif
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -46,13 +50,21 @@ struct max_factorial;
template <class T, bool = true>
struct unchecked_factorial_data;
#ifdef BOOST_MATH_HAS_NVRTC
// Need fwd decl
template <typename T>
BOOST_MATH_GPU_ENABLED inline T unchecked_factorial(unsigned i);
#endif
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
template <bool b>
struct unchecked_factorial_data<float, b>
{
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
static constexpr std::array<float, 35> factorials = { {
static constexpr boost::math::array<float, 35> factorials = { {
1.0F,
1.0F,
2.0F,
@@ -90,15 +102,15 @@ struct unchecked_factorial_data<float, b>
0.29523279903960414084761860964352e39F,
}};
#else
static const std::array<float, 35> factorials;
static const boost::math::array<float, 35> factorials;
#endif
};
template<bool b>
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
constexpr std::array<float, 35> unchecked_factorial_data<float, b>::factorials;
constexpr boost::math::array<float, 35> unchecked_factorial_data<float, b>::factorials;
#else
const std::array<float, 35> unchecked_factorial_data<float, b>::factorials = {{
const boost::math::array<float, 35> unchecked_factorial_data<float, b>::factorials = {{
1.0F,
1.0F,
2.0F,
@@ -204,7 +216,7 @@ template <bool b>
struct unchecked_factorial_data<double, b>
{
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
static constexpr std::array<double, 171> factorials = { {
static constexpr boost::math::array<double, 171> factorials = { {
1.0,
1.0,
2.0,
@@ -378,15 +390,15 @@ struct unchecked_factorial_data<double, b>
0.7257415615307998967396728211129263114717e307,
}};
#else
static const std::array<double, 171> factorials;
static const boost::math::array<double, 171> factorials;
#endif
};
template <bool b>
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
constexpr std::array<double, 171> unchecked_factorial_data<double, b>::factorials;
constexpr boost::math::array<double, 171> unchecked_factorial_data<double, b>::factorials;
#else
const std::array<double, 171> unchecked_factorial_data<double, b>::factorials = {{
const boost::math::array<double, 171> unchecked_factorial_data<double, b>::factorials = {{
1.0,
1.0,
2.0,
@@ -633,7 +645,7 @@ template <bool b>
struct unchecked_factorial_data<long double, b>
{
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
static constexpr std::array<long double, 171> factorials = { {
static constexpr boost::math::array<long double, 171> factorials = { {
1L,
1L,
2L,
@@ -807,15 +819,15 @@ struct unchecked_factorial_data<long double, b>
0.7257415615307998967396728211129263114717e307L,
}};
#else
static const std::array<long double, 171> factorials;
static const boost::math::array<long double, 171> factorials;
#endif
};
template <bool b>
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
constexpr std::array<long double, 171> unchecked_factorial_data<long double, b>::factorials;
constexpr boost::math::array<long double, 171> unchecked_factorial_data<long double, b>::factorials;
#else
const std::array<long double, 171> unchecked_factorial_data<long double, b>::factorials = {{
const boost::math::array<long double, 171> unchecked_factorial_data<long double, b>::factorials = {{
1L,
1L,
2L,
@@ -1008,7 +1020,7 @@ template <bool b>
struct unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>
{
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
static constexpr std::array<BOOST_MATH_FLOAT128_TYPE, 171> factorials = { {
static constexpr boost::math::array<BOOST_MATH_FLOAT128_TYPE, 171> factorials = { {
1,
1,
2,
@@ -1182,15 +1194,15 @@ struct unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>
0.7257415615307998967396728211129263114717e307Q,
} };
#else
static const std::array<BOOST_MATH_FLOAT128_TYPE, 171> factorials;
static const boost::math::array<BOOST_MATH_FLOAT128_TYPE, 171> factorials;
#endif
};
template <bool b>
#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES
constexpr std::array<BOOST_MATH_FLOAT128_TYPE, 171> unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>::factorials;
constexpr boost::math::array<BOOST_MATH_FLOAT128_TYPE, 171> unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>::factorials;
#else
const std::array<BOOST_MATH_FLOAT128_TYPE, 171> unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>::factorials = { {
const boost::math::array<BOOST_MATH_FLOAT128_TYPE, 171> unchecked_factorial_data<BOOST_MATH_FLOAT128_TYPE, b>::factorials = { {
1,
1,
2,
@@ -1402,7 +1414,7 @@ const typename unchecked_factorial_initializer<T>::init unchecked_factorial_init
template <class T, int N>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, N>&)
inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, N>&)
{
//
// If you're foolish enough to instantiate factorial
@@ -1416,10 +1428,10 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, N
// unsigned int nfac = static_cast<unsigned int>(factorial<double>(n));
// See factorial documentation for more detail.
//
static_assert(!std::is_integral<T>::value && !std::numeric_limits<T>::is_integer, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value && !boost::math::numeric_limits<T>::is_integer, "Type T must not be an integral type");
// We rely on C++11 thread safe initialization here:
static const std::array<T, 101> factorials = {{
static const boost::math::array<T, 101> factorials = {{
T(boost::math::tools::convert_from_string<T>("1")),
T(boost::math::tools::convert_from_string<T>("1")),
T(boost::math::tools::convert_from_string<T>("2")),
@@ -1527,7 +1539,7 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, N
}
template <class T>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 0>&)
inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, 0>&)
{
//
// If you're foolish enough to instantiate factorial
@@ -1541,7 +1553,7 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 0
// unsigned int nfac = static_cast<unsigned int>(factorial<double>(n));
// See factorial documentation for more detail.
//
static_assert(!std::is_integral<T>::value && !std::numeric_limits<T>::is_integer, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value && !boost::math::numeric_limits<T>::is_integer, "Type T must not be an integral type");
static const char* const factorial_strings[] = {
"1",
@@ -1667,13 +1679,13 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 0
#endif // BOOST_MATH_HAS_GPU_SUPPORT
template <class T>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, std::numeric_limits<float>::digits>&)
BOOST_MATH_GPU_ENABLED inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, boost::math::numeric_limits<float>::digits>&)
{
return unchecked_factorial<float>(i);
}
template <class T>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, std::numeric_limits<double>::digits>&)
BOOST_MATH_GPU_ENABLED inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, boost::math::numeric_limits<double>::digits>&)
{
return unchecked_factorial<double>(i);
}
@@ -1682,14 +1694,14 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, s
#if DBL_MANT_DIG != LDBL_MANT_DIG
template <class T>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, LDBL_MANT_DIG>&)
inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, LDBL_MANT_DIG>&)
{
return unchecked_factorial<long double>(i);
}
#endif
#ifdef BOOST_MATH_USE_FLOAT128
template <class T>
inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 113>&)
inline T unchecked_factorial_imp(unsigned i, const boost::math::integral_constant<int, 113>&)
{
return unchecked_factorial<BOOST_MATH_FLOAT128_TYPE>(i);
}
@@ -1698,14 +1710,14 @@ inline T unchecked_factorial_imp(unsigned i, const std::integral_constant<int, 1
#endif // BOOST_MATH_HAS_GPU_SUPPORT
template <class T>
inline T unchecked_factorial(unsigned i)
BOOST_MATH_GPU_ENABLED inline T unchecked_factorial(unsigned i)
{
typedef typename boost::math::policies::precision<T, boost::math::policies::policy<> >::type tag_type;
return unchecked_factorial_imp<T>(i, tag_type());
}
#ifdef BOOST_MATH_USE_FLOAT128
#define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL : std::numeric_limits<T>::digits == 113 ? max_factorial<BOOST_MATH_FLOAT128_TYPE>::value
#define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL : boost::math::numeric_limits<T>::digits == 113 ? max_factorial<BOOST_MATH_FLOAT128_TYPE>::value
#else
#define BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL
#endif
@@ -1714,10 +1726,10 @@ template <class T>
struct max_factorial
{
static constexpr unsigned value =
std::numeric_limits<T>::digits == std::numeric_limits<float>::digits ? max_factorial<float>::value
: std::numeric_limits<T>::digits == std::numeric_limits<double>::digits ? max_factorial<double>::value
boost::math::numeric_limits<T>::digits == boost::math::numeric_limits<float>::digits ? max_factorial<float>::value
: boost::math::numeric_limits<T>::digits == boost::math::numeric_limits<double>::digits ? max_factorial<double>::value
#ifndef BOOST_MATH_GPU_ENABLED
: std::numeric_limits<T>::digits == std::numeric_limits<long double>::digits ? max_factorial<long double>::value
: boost::math::numeric_limits<T>::digits == boost::math::numeric_limits<long double>::digits ? max_factorial<long double>::value
BOOST_MATH_DETAIL_FLOAT128_MAX_FACTORIAL
#endif
: 100;

View File

@@ -15,9 +15,6 @@
#ifndef BOOST_MATH_HAS_NVRTC
#include <cmath>
#include <cstdint>
#include <limits>
#include <boost/math/tools/series.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/big_constant.hpp>
@@ -25,6 +22,9 @@
#include <boost/math/tools/rational.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/assert.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#if defined(__GNUC__) && defined(BOOST_MATH_USE_FLOAT128)
//
@@ -49,10 +49,10 @@ namespace detail
{
typedef T result_type;
expm1_series(T x)
BOOST_MATH_GPU_ENABLED expm1_series(T x)
: k(0), m_x(x), m_term(1) {}
T operator()()
BOOST_MATH_GPU_ENABLED T operator()()
{
++k;
m_term *= m_x;
@@ -60,7 +60,7 @@ namespace detail
return m_term;
}
int count()const
BOOST_MATH_GPU_ENABLED int count()const
{
return k;
}
@@ -78,26 +78,28 @@ struct expm1_initializer
{
struct init
{
init()
BOOST_MATH_GPU_ENABLED init()
{
do_init(tag());
}
template <int N>
static void do_init(const std::integral_constant<int, N>&){}
static void do_init(const std::integral_constant<int, 64>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, N>&){}
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 64>&)
{
expm1(T(0.5));
}
static void do_init(const std::integral_constant<int, 113>&)
BOOST_MATH_GPU_ENABLED static void do_init(const boost::math::integral_constant<int, 113>&)
{
expm1(T(0.5));
}
void force_instantiate()const{}
BOOST_MATH_GPU_ENABLED void force_instantiate()const{}
};
static const init initializer;
static void force_instantiate()
BOOST_MATH_STATIC const init initializer;
BOOST_MATH_GPU_ENABLED static void force_instantiate()
{
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
initializer.force_instantiate();
#endif
}
};
@@ -110,7 +112,7 @@ const typename expm1_initializer<T, Policy, tag>::init expm1_initializer<T, Poli
// This version uses a Taylor series expansion for 0.5 > |x| > epsilon.
//
template <class T, class Policy>
T expm1_imp(T x, const std::integral_constant<int, 0>&, const Policy& pol)
T expm1_imp(T x, const boost::math::integral_constant<int, 0>&, const Policy& pol)
{
BOOST_MATH_STD_USING
@@ -132,7 +134,7 @@ T expm1_imp(T x, const std::integral_constant<int, 0>&, const Policy& pol)
if(a < tools::epsilon<T>())
return x;
detail::expm1_series<T> s(x);
std::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
boost::math::uintmax_t max_iter = policies::get_max_series_iterations<Policy>();
T result = tools::sum_series(s, policies::get_epsilon<T, Policy>(), max_iter);
@@ -141,7 +143,7 @@ T expm1_imp(T x, const std::integral_constant<int, 0>&, const Policy& pol)
}
template <class T, class P>
T expm1_imp(T x, const std::integral_constant<int, 53>&, const P& pol)
BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant<int, 53>&, const P& pol)
{
BOOST_MATH_STD_USING
@@ -159,16 +161,16 @@ T expm1_imp(T x, const std::integral_constant<int, 53>&, const P& pol)
if(a < tools::epsilon<T>())
return x;
static const float Y = 0.10281276702880859e1f;
static const T n[] = { static_cast<T>(-0.28127670288085937e-1), static_cast<T>(0.51278186299064534e0), static_cast<T>(-0.6310029069350198e-1), static_cast<T>(0.11638457975729296e-1), static_cast<T>(-0.52143390687521003e-3), static_cast<T>(0.21491399776965688e-4) };
static const T d[] = { 1, static_cast<T>(-0.45442309511354755e0), static_cast<T>(0.90850389570911714e-1), static_cast<T>(-0.10088963629815502e-1), static_cast<T>(0.63003407478692265e-3), static_cast<T>(-0.17976570003654402e-4) };
BOOST_MATH_STATIC const float Y = 0.10281276702880859e1f;
BOOST_MATH_STATIC const T n[] = { static_cast<T>(-0.28127670288085937e-1), static_cast<T>(0.51278186299064534e0), static_cast<T>(-0.6310029069350198e-1), static_cast<T>(0.11638457975729296e-1), static_cast<T>(-0.52143390687521003e-3), static_cast<T>(0.21491399776965688e-4) };
BOOST_MATH_STATIC const T d[] = { 1, static_cast<T>(-0.45442309511354755e0), static_cast<T>(0.90850389570911714e-1), static_cast<T>(-0.10088963629815502e-1), static_cast<T>(0.63003407478692265e-3), static_cast<T>(-0.17976570003654402e-4) };
T result = x * Y + x * tools::evaluate_polynomial(n, x) / tools::evaluate_polynomial(d, x);
return result;
}
template <class T, class P>
T expm1_imp(T x, const std::integral_constant<int, 64>&, const P& pol)
BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant<int, 64>&, const P& pol)
{
BOOST_MATH_STD_USING
@@ -186,8 +188,8 @@ T expm1_imp(T x, const std::integral_constant<int, 64>&, const P& pol)
if(a < tools::epsilon<T>())
return x;
static const float Y = 0.10281276702880859375e1f;
static const T n[] = {
BOOST_MATH_STATIC const float Y = 0.10281276702880859375e1f;
BOOST_MATH_STATIC const T n[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, -0.281276702880859375e-1),
BOOST_MATH_BIG_CONSTANT(T, 64, 0.512980290285154286358e0),
BOOST_MATH_BIG_CONSTANT(T, 64, -0.667758794592881019644e-1),
@@ -196,7 +198,7 @@ T expm1_imp(T x, const std::integral_constant<int, 64>&, const P& pol)
BOOST_MATH_BIG_CONSTANT(T, 64, 0.447441185192951335042e-4),
BOOST_MATH_BIG_CONSTANT(T, 64, -0.714539134024984593011e-6)
};
static const T d[] = {
BOOST_MATH_STATIC const T d[] = {
BOOST_MATH_BIG_CONSTANT(T, 64, 1.0),
BOOST_MATH_BIG_CONSTANT(T, 64, -0.461477618025562520389e0),
BOOST_MATH_BIG_CONSTANT(T, 64, 0.961237488025708540713e-1),
@@ -211,7 +213,7 @@ T expm1_imp(T x, const std::integral_constant<int, 64>&, const P& pol)
}
template <class T, class P>
T expm1_imp(T x, const std::integral_constant<int, 113>&, const P& pol)
BOOST_MATH_GPU_ENABLED T expm1_imp(T x, const boost::math::integral_constant<int, 113>&, const P& pol)
{
BOOST_MATH_STD_USING
@@ -263,7 +265,7 @@ T expm1_imp(T x, const std::integral_constant<int, 113>&, const P& pol)
} // namespace detail
template <class T, class Policy>
inline typename tools::promote_args<T>::type expm1(T x, const Policy& /* pol */)
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type expm1(T x, const Policy& /* pol */)
{
typedef typename tools::promote_args<T>::type result_type;
typedef typename policies::evaluation<result_type, Policy>::type value_type;
@@ -275,7 +277,7 @@ inline typename tools::promote_args<T>::type expm1(T x, const Policy& /* pol */)
policies::discrete_quantile<>,
policies::assert_undefined<> >::type forwarding_policy;
typedef std::integral_constant<int,
typedef boost::math::integral_constant<int,
precision_type::value <= 0 ? 0 :
precision_type::value <= 53 ? 53 :
precision_type::value <= 64 ? 64 :
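The series functor above is what tools::sum_series drives: each call returns the next Taylor term x^k/k!, and summation stops once terms drop below epsilon relative to the running sum. A standalone sketch of that loop (plain double, fixed tolerance, no Policy-supplied iteration cap):

#include <cmath>
#include <cstdint>
#include <cstdio>

struct expm1_series_sketch
{
   explicit expm1_series_sketch(double x) : k(0), m_x(x), m_term(1) {}
   double operator()() { ++k; m_term *= m_x; m_term /= k; return m_term; }  // next x^k / k!
   int k;
   double m_x, m_term;
};

double expm1_by_series(double x)
{
   expm1_series_sketch s(x);
   double sum = 0, next;
   std::uintmax_t max_iter = 1000;  // stands in for policies::get_max_series_iterations
   do { next = s(); sum += next; } while (std::fabs(next) > 1e-17 * std::fabs(sum) && --max_iter);
   return sum;
}

int main() { std::printf("%.17g\n%.17g\n", expm1_by_series(0.25), std::expm1(0.25)); }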

View File

@@ -10,10 +10,14 @@
#pragma once
#endif
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/gamma.hpp>
#include <boost/math/special_functions/detail/unchecked_factorial.hpp>
#include <array>
#include <boost/math/special_functions/math_fwd.hpp>
#ifdef _MSC_VER
#pragma warning(push) // Temporary until lexical cast fixed.
#pragma warning(disable: 4127 4701)
@@ -21,16 +25,14 @@
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#include <type_traits>
#include <cmath>
namespace boost { namespace math
{
template <class T, class Policy>
inline T factorial(unsigned i, const Policy& pol)
BOOST_MATH_GPU_ENABLED inline T factorial(unsigned i, const Policy& pol)
{
static_assert(!std::is_integral<T>::value, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value, "Type T must not be an integral type");
// factorial<unsigned int>(n) is not implemented
// because it would overflow integral type T for too small n
// to be useful. Use instead a floating-point type,
@@ -49,7 +51,7 @@ inline T factorial(unsigned i, const Policy& pol)
}
template <class T>
inline T factorial(unsigned i)
BOOST_MATH_GPU_ENABLED inline T factorial(unsigned i)
{
return factorial<T>(i, policies::policy<>());
}
@@ -72,9 +74,9 @@ inline double factorial<double>(unsigned i)
}
*/
template <class T, class Policy>
T double_factorial(unsigned i, const Policy& pol)
BOOST_MATH_GPU_ENABLED T double_factorial(unsigned i, const Policy& pol)
{
static_assert(!std::is_integral<T>::value, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value, "Type T must not be an integral type");
BOOST_MATH_STD_USING // ADL lookup of std names
if(i & 1)
{
@@ -107,17 +109,20 @@ T double_factorial(unsigned i, const Policy& pol)
}
template <class T>
inline T double_factorial(unsigned i)
BOOST_MATH_GPU_ENABLED inline T double_factorial(unsigned i)
{
return double_factorial<T>(i, policies::policy<>());
}
// TODO(mborland): We do not currently have support for tgamma_delta_ratio
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
namespace detail{
template <class T, class Policy>
T rising_factorial_imp(T x, int n, const Policy& pol)
{
static_assert(!std::is_integral<T>::value, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value, "Type T must not be an integral type");
if(x < 0)
{
//
@@ -165,7 +170,7 @@ T rising_factorial_imp(T x, int n, const Policy& pol)
template <class T, class Policy>
inline T falling_factorial_imp(T x, unsigned n, const Policy& pol)
{
static_assert(!std::is_integral<T>::value, "Type T must not be an integral type");
static_assert(!boost::math::is_integral<T>::value, "Type T must not be an integral type");
BOOST_MATH_STD_USING // ADL of std names
if(x == 0)
return 0;
@@ -262,6 +267,8 @@ inline typename tools::promote_args<RT>::type
static_cast<result_type>(x), n, pol);
}
#endif // BOOST_MATH_HAS_GPU_SUPPORT
} // namespace math
} // namespace boost
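For the double_factorial branches, the identities being exploited are (2k)!! = 2^k * k! and (2k+1)!! = (2k+1)! / (2^k * k!). A standalone sketch of those identities (plain double, no overflow policy, tgamma standing in for the factorial tables):

#include <cmath>
#include <cstdio>

double factorial_d(unsigned n) { return std::tgamma(n + 1.0); }

double double_factorial_sketch(unsigned i)
{
   unsigned k = i / 2;
   if (i & 1)   // odd: (2k+1)!! = (2k+1)! / (2^k * k!)
   {
      return factorial_d(i) / (std::ldexp(1.0, k) * factorial_d(k));
   }
   return std::ldexp(1.0, k) * factorial_d(k);   // even: (2k)!! = 2^k * k!
}

int main() { std::printf("%g %g\n", double_factorial_sketch(5), double_factorial_sketch(6)); } // 15 48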

View File

@@ -2287,6 +2287,7 @@ BOOST_MATH_GPU_ENABLED inline tools::promote_args_t<T1, T2>
#else
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/expm1.hpp>
namespace boost {
namespace math {
@@ -2295,7 +2296,7 @@ inline BOOST_MATH_GPU_ENABLED float tgamma(float x) { return ::tgammaf(x); }
inline BOOST_MATH_GPU_ENABLED double tgamma(double x) { return ::tgamma(x); }
template <typename T, typename Policy>
inline BOOST_MATH_GPU_ENABLED T tgamma(T x, const Policy&)
BOOST_MATH_GPU_ENABLED T tgamma(T x, const Policy&)
{
return boost::math::tgamma(x);
}
@@ -2304,11 +2305,49 @@ inline BOOST_MATH_GPU_ENABLED float lgamma(float x) { return ::lgammaf(x); }
inline BOOST_MATH_GPU_ENABLED double lgamma(double x) { return ::lgamma(x); }
template <typename T, typename Policy>
inline BOOST_MATH_GPU_ENABLED T lgamma(T x, const Policy&)
BOOST_MATH_GPU_ENABLED T lgamma(T x, const Policy&)
{
return boost::math::lgamma(x);
}
template <typename T, typename Policy>
BOOST_MATH_GPU_ENABLED T lgamma(T x, int* sign, const Policy&)
{
auto res = boost::math::lgamma(x);
if (sign != nullptr)
{
if (res < 0)
{
*sign = -1;
}
else
{
*sign = 1;
}
}
return res;
}
template <typename T>
BOOST_MATH_GPU_ENABLED T tgamma1pm1(T z)
{
using namespace boost::math;
if (fabs(z) < T(0.55))
{
return expm1(lgamma(z));
}
return expm1(lgamma(1 + z));
}
template <typename T, typename Policy>
BOOST_MATH_GPU_ENABLED T tgamma1pm1(T x, const Policy&)
{
return tgamma1pm1(x);
}
} // namespace math
} // namespace boost
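A device-side usage sketch for the new sign-reporting lgamma overload. This assumes a CUDA translation unit with the relevant headers available to the NVRTC program; the kernel and buffer names are illustrative, not part of the commit:

__global__ void lgamma_kernel(const double* in, double* out, int* signs, int n)
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < n)
   {
      // Forwards to ::lgamma and reports the sign of the returned log-gamma value.
      out[i] = boost::math::lgamma(in[i], &signs[i], boost::math::policies::policy<>());
   }
}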

View File

@@ -12,20 +12,20 @@
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <algorithm> // for swap
#include <cmath>
namespace boost{ namespace math{ namespace detail{
template <class T, class Policy>
T hypot_imp(T x, T y, const Policy& pol)
BOOST_MATH_GPU_ENABLED T hypot_imp(T x, T y, const Policy& pol)
{
//
// Normalize x and y, so that both are positive and x >= y:
//
using std::fabs; using std::sqrt; // ADL of std names
BOOST_MATH_STD_USING
x = fabs(x);
y = fabs(y);
@@ -35,16 +35,16 @@ T hypot_imp(T x, T y, const Policy& pol)
#pragma warning(disable: 4127)
#endif
// special case, see C99 Annex F:
if(std::numeric_limits<T>::has_infinity
&& ((x == std::numeric_limits<T>::infinity())
|| (y == std::numeric_limits<T>::infinity())))
if(boost::math::numeric_limits<T>::has_infinity
&& ((x == boost::math::numeric_limits<T>::infinity())
|| (y == boost::math::numeric_limits<T>::infinity())))
return policies::raise_overflow_error<T>("boost::math::hypot<%1%>(%1%,%1%)", nullptr, pol);
#ifdef _MSC_VER
#pragma warning(pop)
#endif
if(y > x)
(std::swap)(x, y);
BOOST_MATH_GPU_SAFE_SWAP(x, y);
if(x * tools::epsilon<T>() >= y)
return x;
@@ -56,7 +56,7 @@ T hypot_imp(T x, T y, const Policy& pol)
}
template <class T1, class T2>
inline typename tools::promote_args<T1, T2>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2>::type
hypot(T1 x, T2 y)
{
typedef typename tools::promote_args<T1, T2>::type result_type;
@@ -65,7 +65,7 @@ inline typename tools::promote_args<T1, T2>::type
}
template <class T1, class T2, class Policy>
inline typename tools::promote_args<T1, T2>::type
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T1, T2>::type
hypot(T1 x, T2 y, const Policy& pol)
{
typedef typename tools::promote_args<T1, T2>::type result_type;
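The guarded steps in hypot_imp (normalize, C99 Annex F infinity check, epsilon early-out, factor out the larger operand) read more clearly in a standalone sketch; this is the same logic in plain C++, without the policy plumbing or the GPU-safe swap shim:

#include <cmath>
#include <cstdio>
#include <limits>
#include <utility>

double hypot_sketch(double x, double y)
{
   x = std::fabs(x); y = std::fabs(y);
   if (std::isinf(x) || std::isinf(y))              // C99 Annex F special case
      return std::numeric_limits<double>::infinity();
   if (y > x) std::swap(x, y);                      // now x >= y >= 0
   if (x * std::numeric_limits<double>::epsilon() >= y)
      return x;                                     // y contributes nothing
   double r = y / x;
   return x * std::sqrt(1 + r * r);                 // no overflow from squaring x
}

int main() { std::printf("%g\n", hypot_sketch(3e200, 4e200)); }  // 5e+200, where naive x*x+y*y overflows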

View File

@@ -24,12 +24,16 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_HAS_NVRTC
#include <vector>
#include <complex>
#include <type_traits>
#include <boost/math/tools/config.hpp>
#include <boost/math/special_functions/detail/round_fwd.hpp>
#include <boost/math/tools/promotion.hpp> // for argument promotion.
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/policies/policy.hpp>
#define BOOST_NO_MACRO_EXPAND /**/
@@ -420,15 +424,15 @@ namespace boost
template <class RT>
struct max_factorial;
template <class RT>
RT factorial(unsigned int);
BOOST_MATH_GPU_ENABLED RT factorial(unsigned int);
template <class RT, class Policy>
RT factorial(unsigned int, const Policy& pol);
BOOST_MATH_GPU_ENABLED RT factorial(unsigned int, const Policy& pol);
template <class RT>
BOOST_MATH_GPU_ENABLED RT unchecked_factorial(unsigned int BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(RT));
template <class RT>
RT double_factorial(unsigned i);
BOOST_MATH_GPU_ENABLED RT double_factorial(unsigned i);
template <class RT, class Policy>
RT double_factorial(unsigned i, const Policy& pol);
BOOST_MATH_GPU_ENABLED RT double_factorial(unsigned i, const Policy& pol);
template <class RT>
tools::promote_args_t<RT> falling_factorial(RT x, unsigned n);
@@ -554,11 +558,11 @@ namespace boost
// Hypotenuse function sqrt(x ^ 2 + y ^ 2).
template <class T1, class T2>
tools::promote_args_t<T1, T2>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
hypot(T1 x, T2 y);
template <class T1, class T2, class Policy>
tools::promote_args_t<T1, T2>
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
hypot(T1 x, T2 y, const Policy&);
// cbrt - cube root.
@@ -607,10 +611,10 @@ namespace boost
// sinus cardinals:
template <class T>
tools::promote_args_t<T> sinc_pi(T x);
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> sinc_pi(T x);
template <class T, class Policy>
tools::promote_args_t<T> sinc_pi(T x, const Policy&);
BOOST_MATH_GPU_ENABLED tools::promote_args_t<T> sinc_pi(T x, const Policy&);
template <class T>
tools::promote_args_t<T> sinhc_pi(T x);
@@ -639,36 +643,36 @@ namespace boost
namespace detail{
typedef std::integral_constant<int, 0> bessel_no_int_tag; // No integer optimisation possible.
typedef std::integral_constant<int, 1> bessel_maybe_int_tag; // Maybe integer optimisation.
typedef std::integral_constant<int, 2> bessel_int_tag; // Definite integer optimisation.
typedef boost::math::integral_constant<int, 0> bessel_no_int_tag; // No integer optimisation possible.
typedef boost::math::integral_constant<int, 1> bessel_maybe_int_tag; // Maybe integer optimisation.
typedef boost::math::integral_constant<int, 2> bessel_int_tag; // Definite integer optimisation.
template <class T1, class T2, class Policy>
struct bessel_traits
{
using result_type = typename std::conditional<
std::is_integral<T1>::value,
using result_type = typename boost::math::conditional<
boost::math::is_integral<T1>::value,
typename tools::promote_args<T2>::type,
tools::promote_args_t<T1, T2>
>::type;
typedef typename policies::precision<result_type, Policy>::type precision_type;
using optimisation_tag = typename std::conditional<
using optimisation_tag = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 64),
bessel_no_int_tag,
typename std::conditional<
std::is_integral<T1>::value,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
>::type;
using optimisation_tag128 = typename std::conditional<
using optimisation_tag128 = typename boost::math::conditional<
(precision_type::value <= 0 || precision_type::value > 113),
bessel_no_int_tag,
typename std::conditional<
std::is_integral<T1>::value,
typename boost::math::conditional<
boost::math::is_integral<T1>::value,
bessel_int_tag,
bessel_maybe_int_tag
>::type
@@ -678,98 +682,98 @@ namespace boost
// Bessel functions:
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j(T1 v, T2 x, const Policy& pol);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j_prime(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_j_prime(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j(T1 v, T2 x);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j_prime(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_j_prime(T1 v, T2 x);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel(unsigned v, T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel(unsigned v, T x, const Policy& pol);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel_prime(unsigned v, T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type sph_bessel_prime(unsigned v, T x, const Policy& pol);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel(unsigned v, T x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel(unsigned v, T x);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel_prime(unsigned v, T x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_bessel_prime(unsigned v, T x);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i(T1 v, T2 x, const Policy& pol);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i_prime(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_i_prime(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i(T1 v, T2 x);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i_prime(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_i_prime(T1 v, T2 x);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k(T1 v, T2 x, const Policy& pol);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k_prime(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_bessel_k_prime(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k(T1 v, T2 x);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k_prime(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_bessel_k_prime(T1 v, T2 x);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann(T1 v, T2 x, const Policy& pol);
template <class T1, class T2, class Policy>
typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann_prime(T1 v, T2 x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, Policy>::result_type cyl_neumann_prime(T1 v, T2 x, const Policy& pol);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann(T1 v, T2 x);
template <class T1, class T2>
typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann_prime(T1 v, T2 x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T1, T2, policies::policy<> >::result_type cyl_neumann_prime(T1 v, T2 x);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann(unsigned v, T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann(unsigned v, T x, const Policy& pol);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann_prime(unsigned v, T x, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type sph_neumann_prime(unsigned v, T x, const Policy& pol);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann(unsigned v, T x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann(unsigned v, T x);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann_prime(unsigned v, T x);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type sph_neumann_prime(unsigned v, T x);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type cyl_bessel_j_zero(T v, int m, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type cyl_bessel_j_zero(T v, int m, const Policy& pol);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_bessel_j_zero(T v, int m);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_bessel_j_zero(T v, int m);
template <class T, class OutputIterator>
OutputIterator cyl_bessel_j_zero(T v,
BOOST_MATH_GPU_ENABLED OutputIterator cyl_bessel_j_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it);
template <class T, class OutputIterator, class Policy>
OutputIterator cyl_bessel_j_zero(T v,
BOOST_MATH_GPU_ENABLED OutputIterator cyl_bessel_j_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it,
const Policy&);
template <class T, class Policy>
typename detail::bessel_traits<T, T, Policy>::result_type cyl_neumann_zero(T v, int m, const Policy& pol);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, Policy>::result_type cyl_neumann_zero(T v, int m, const Policy& pol);
template <class T>
typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_neumann_zero(T v, int m);
BOOST_MATH_GPU_ENABLED typename detail::bessel_traits<T, T, policies::policy<> >::result_type cyl_neumann_zero(T v, int m);
template <class T, class OutputIterator>
OutputIterator cyl_neumann_zero(T v,
BOOST_MATH_GPU_ENABLED OutputIterator cyl_neumann_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it);
template <class T, class OutputIterator, class Policy>
OutputIterator cyl_neumann_zero(T v,
BOOST_MATH_GPU_ENABLED OutputIterator cyl_neumann_zero(T v,
int start_index,
unsigned number_of_zeros,
OutputIterator out_it,
@@ -1400,10 +1404,10 @@ namespace boost
\
using boost::math::max_factorial;\
template <class RT>\
inline RT factorial(unsigned int i) { return boost::math::factorial<RT>(i, Policy()); }\
BOOST_MATH_GPU_ENABLED inline RT factorial(unsigned int i) { return boost::math::factorial<RT>(i, Policy()); }\
using boost::math::unchecked_factorial;\
template <class RT>\
inline RT double_factorial(unsigned i){ return boost::math::double_factorial<RT>(i, Policy()); }\
BOOST_MATH_GPU_ENABLED inline RT double_factorial(unsigned i){ return boost::math::double_factorial<RT>(i, Policy()); }\
template <class RT>\
inline boost::math::tools::promote_args_t<RT> falling_factorial(RT x, unsigned n){ return boost::math::falling_factorial(x, n, Policy()); }\
template <class RT>\
@@ -1465,7 +1469,7 @@ namespace boost
\
template <class T1, class T2>\
inline boost::math::tools::promote_args_t<T1, T2> \
hypot(T1 x, T2 y){ return boost::math::hypot(x, y, Policy()); }\
BOOST_MATH_GPU_ENABLED hypot(T1 x, T2 y){ return boost::math::hypot(x, y, Policy()); }\
\
template <class RT>\
inline boost::math::tools::promote_args_t<RT> cbrt(RT z){ return boost::math::cbrt(z, Policy()); }\
@@ -1487,7 +1491,7 @@ namespace boost
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> sqrt1pm1(const T& val){ return boost::math::sqrt1pm1(val, Policy()); }\
\
template <class T>\
inline boost::math::tools::promote_args_t<T> sinc_pi(T x){ return boost::math::sinc_pi(x, Policy()); }\
BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t<T> sinc_pi(T x){ return boost::math::sinc_pi(x, Policy()); }\
\
template <class T>\
inline boost::math::tools::promote_args_t<T> sinhc_pi(T x){ return boost::math::sinhc_pi(x, Policy()); }\
@@ -1817,6 +1821,6 @@ template <class OutputIterator, class T>\
#endif // BOOST_MATH_HAS_NVRTC
#endif // BOOST_MATH_SPECIAL_MATH_FWD_HPP
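The bessel_traits tags deserve one concrete reading: integral orders get the definite-integer fast path, floating orders the maybe-integer one, and anything wider than 64 mantissa bits opts out entirely. A simplified standalone sketch, with numeric_limits standing in for the policy-derived precision_type:

#include <type_traits>
#include <limits>

using no_int_tag    = std::integral_constant<int, 0>;
using maybe_int_tag = std::integral_constant<int, 1>;
using int_tag       = std::integral_constant<int, 2>;

template <class T1, class T2>
struct traits_sketch
{
   static constexpr int digits = std::numeric_limits<T2>::digits;
   using optimisation_tag = typename std::conditional<
      (digits <= 0 || digits > 64),
      no_int_tag,
      typename std::conditional<std::is_integral<T1>::value, int_tag, maybe_int_tag>::type
   >::type;
};

static_assert(traits_sketch<int, double>::optimisation_tag::value == 2, "integer order");
static_assert(traits_sketch<double, double>::optimisation_tag::value == 1, "maybe integer");

int main() {}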

View File

@@ -10,6 +10,11 @@
#pragma once
#endif
#include <boost/math/tools/config.hpp>
// TODO(mborland): Need to remove recursion from these algos
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
@@ -920,4 +925,6 @@ inline typename tools::promote_args<T>::type float_advance(const T& val, int dis
}} // boost math namespaces
#endif
#endif // BOOST_MATH_SPECIAL_NEXT_HPP

View File

@@ -273,6 +273,30 @@ BOOST_MATH_GPU_ENABLED float round(float x, const Policy&)
return ::roundf(x);
}
template <typename T>
BOOST_MATH_GPU_ENABLED int iround(T x)
{
return static_cast<int>(::lround(x));
}
template <>
BOOST_MATH_GPU_ENABLED int iround(float x)
{
return static_cast<int>(::lroundf(x));
}
template <typename T, typename Policy>
BOOST_MATH_GPU_ENABLED int iround(T x, const Policy&)
{
return static_cast<int>(::lround(x));
}
template <typename Policy>
BOOST_MATH_GPU_ENABLED int iround(float x, const Policy&)
{
return static_cast<int>(::lroundf(x));
}
template <typename T>
BOOST_MATH_GPU_ENABLED long lround(T x)
{
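Usage of the new iround overloads from device code, for orientation (illustrative kernel; under NVRTC the float overload lowers to ::lroundf and everything else to ::lround):

extern "C" __global__ void iround_kernel(const float* in, int* out, int n)
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < n)
   {
      out[i] = boost::math::iround(in[i]);   // ::lroundf path
   }
}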

View File

@@ -17,13 +17,13 @@
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/precision.hpp>
#include <boost/math/tools/promotion.hpp>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <limits>
#include <string>
#include <stdexcept>
#include <cmath>
#ifndef BOOST_MATH_HAS_NVRTC
#include <boost/math/special_functions/math_fwd.hpp>
#endif
// These are the the "Sinus Cardinal" functions.
@@ -36,7 +36,7 @@ namespace boost
// This is the "Sinus Cardinal" of index Pi.
template<typename T>
inline T sinc_pi_imp(const T x)
BOOST_MATH_GPU_ENABLED inline T sinc_pi_imp(const T x)
{
BOOST_MATH_STD_USING
@@ -44,7 +44,7 @@ namespace boost
{
return 0;
}
else if (abs(x) >= 3.3 * tools::forth_root_epsilon<T>())
else if (abs(x) >= T(3.3) * tools::forth_root_epsilon<T>())
{
return(sin(x)/x);
}
@@ -58,24 +58,23 @@ namespace boost
} // namespace detail
template <class T>
inline typename tools::promote_args<T>::type sinc_pi(T x)
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type sinc_pi(T x)
{
typedef typename tools::promote_args<T>::type result_type;
return detail::sinc_pi_imp(static_cast<result_type>(x));
}
template <class T, class Policy>
inline typename tools::promote_args<T>::type sinc_pi(T x, const Policy&)
BOOST_MATH_GPU_ENABLED inline typename tools::promote_args<T>::type sinc_pi(T x, const Policy&)
{
typedef typename tools::promote_args<T>::type result_type;
return detail::sinc_pi_imp(static_cast<result_type>(x));
}
template<typename T, template<typename> class U>
inline U<T> sinc_pi(const U<T> x)
BOOST_MATH_GPU_ENABLED inline U<T> sinc_pi(const U<T> x)
{
BOOST_MATH_STD_USING
using ::std::numeric_limits;
T const taylor_0_bound = tools::epsilon<T>();
T const taylor_2_bound = tools::root_epsilon<T>();
@@ -88,11 +87,11 @@ namespace boost
else
{
// approximation by taylor series in x at 0 up to order 0
#ifdef __MWERKS__
#ifdef __MWERKS__
U<T> result = static_cast<U<T> >(1);
#else
#else
U<T> result = U<T>(1);
#endif
#endif
if (abs(x) >= taylor_0_bound)
{
@@ -113,7 +112,7 @@ namespace boost
}
template<typename T, template<typename> class U, class Policy>
inline U<T> sinc_pi(const U<T> x, const Policy&)
BOOST_MATH_GPU_ENABLED inline U<T> sinc_pi(const U<T> x, const Policy&)
{
return sinc_pi(x);
}
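A standalone numerical sketch of the cutoff logic above: below roughly 3.3 * epsilon^(1/4) the direct sin(x)/x evaluation is replaced by the leading Taylor terms, which avoids the 0/0 at the origin. `sinc_pi_sketch` is an illustrative name, not the library routine:
#include <cmath>
#include <limits>
#include <cstdio>
double sinc_pi_sketch(double x)
{
    const double cutoff = 3.3 * std::pow(std::numeric_limits<double>::epsilon(), 0.25);
    if (std::abs(x) >= cutoff)
    {
        return std::sin(x) / x;   // safe: x is well away from zero
    }
    return 1.0 - x * x / 6.0;     // leading Taylor terms of sin(x)/x at 0
}
int main()
{
    std::printf("%.17g %.17g\n", sinc_pi_sketch(0.0), sinc_pi_sketch(0.5));
    return 0;
}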

View File

@@ -14,6 +14,7 @@
#include <boost/math/policies/error_handling.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/math/special_functions/next.hpp>
#include <boost/math/tools/precision.hpp>
namespace boost{ namespace math{ namespace detail{

View File

@@ -0,0 +1,41 @@
// Copyright (c) 2024 Matt Borland
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// std::array member functions cannot be used directly on GPU platforms
// like CUDA since they are missing the __device__ marker.
// Alias as needed to get the correct support.
#ifndef BOOST_MATH_TOOLS_ARRAY_HPP
#define BOOST_MATH_TOOLS_ARRAY_HPP
#include <boost/math/tools/config.hpp>
#ifdef BOOST_MATH_ENABLE_CUDA
#include <cuda/std/array>
namespace boost {
namespace math {
using cuda::std::array;
} // namespace math
} // namespace boost
#else
#include <array>
namespace boost {
namespace math {
using std::array;
} // namespace math
} // namespace boost
#endif // BOOST_MATH_ENABLE_CUDA
#endif // BOOST_MATH_TOOLS_ARRAY_HPP
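A usage sketch for the alias above, assuming BOOST_MATH_GPU_ENABLED expands to nothing on host-only builds: the same coefficient table then compiles against std::array on the host and cuda::std::array in device code. `eval_poly3` is an illustrative name:
#include <boost/math/tools/array.hpp>
BOOST_MATH_GPU_ENABLED double eval_poly3(double x)
{
    // Coefficients of 1 + 2x + 3x^2 (illustrative values)
    constexpr boost::math::array<double, 3> c {{ 1.0, 2.0, 3.0 }};
    return c[0] + x * (c[1] + x * c[2]);
}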

View File

@@ -676,6 +676,7 @@ namespace boost{ namespace math{
#include <cuda/std/type_traits>
#include <cuda/std/utility>
#include <cuda/std/cstdint>
#include <cuda/std/array>
# define BOOST_MATH_CUDA_ENABLED __host__ __device__
# define BOOST_MATH_HAS_GPU_SUPPORT
@@ -733,7 +734,7 @@ BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b;
template <class T>
BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_min(const T& a, const T& b) { return a < b ? a : b; }
template <class T>
BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { return a > b ? a : b; }
BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return a > b ? a : b; }
#define BOOST_MATH_GPU_SAFE_SWAP(a, b) gpu_safe_swap(a, b)
#define BOOST_MATH_GPU_SAFE_MIN(a, b) gpu_safe_min(a, b)
@@ -794,10 +795,13 @@ BOOST_MATH_GPU_ENABLED constexpr T cuda_safe_max(const T& a, const T& b) { retur
#define BOOST_MATH_NOEXCEPT(T) noexcept(boost::math::is_floating_point_v<T>)
#define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)
#define BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)
#define BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(T)
#define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast<T>(V)
#define BOOST_MATH_FORCEINLINE __forceinline__
#define BOOST_MATH_STD_USING
#define BOOST_MATH_IF_CONSTEXPR if constexpr
#define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point<T>::value)
#define BOOST_MATH_CONSTEXPR_TABLE_FUNCTION constexpr
// This should be defined to nothing but since it is not specifically a math macro
// we need to undef before proceeding
@@ -829,6 +833,9 @@ BOOST_MATH_GPU_ENABLED constexpr void gpu_safe_swap(T& a, T& b) { T t(a); a = b;
# define BOOST_MATH_INLINE_CONSTEXPR constexpr
#endif
#define BOOST_MATH_INSTRUMENT_VARIABLE(x)
#define BOOST_MATH_INSTRUMENT_CODE(x)
#endif // NVRTC
#endif // BOOST_MATH_TOOLS_CONFIG_HPP
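A standalone sketch of the gpu-safe helpers this hunk touches (which also fixes the cuda_safe_max/gpu_safe_max naming mismatch): constexpr replacements for std::min and std::max that can carry the __host__ __device__ markers. The *_sketch names are illustrative:
template <class T>
constexpr T gpu_safe_min_sketch(const T& a, const T& b) { return a < b ? a : b; }
template <class T>
constexpr T gpu_safe_max_sketch(const T& a, const T& b) { return a > b ? a : b; }
static_assert(gpu_safe_min_sketch(2, 3) == 2, "min picks the smaller value");
static_assert(gpu_safe_max_sketch(2, 3) == 3, "max picks the larger value");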

View File

@@ -1,4 +1,5 @@
// (C) Copyright John Maddock 2006.
// (C) Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -11,22 +12,19 @@
#endif
#include <boost/math/tools/config.hpp>
#ifndef BOOST_MATH_HAS_NVRTC // Disabled for now
#include <boost/math/tools/complex.hpp> // test for multiprecision types in complex Newton
#include <utility>
#include <cmath>
#include <tuple>
#include <cstdint>
#include <boost/math/tools/cxx03_warn.hpp>
#include <boost/math/tools/type_traits.hpp>
#include <boost/math/tools/cstdint.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/tuple.hpp>
#include <boost/math/special_functions/sign.hpp>
#include <boost/math/policies/policy.hpp>
#include <boost/math/policies/error_handling.hpp>
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
#include <boost/math/special_functions/next.hpp>
#include <boost/math/tools/toms748_solve.hpp>
#include <boost/math/policies/error_handling.hpp>
#endif
namespace boost {
namespace math {
@@ -37,11 +35,11 @@ namespace detail {
namespace dummy {
template<int n, class T>
typename T::value_type get(const T&) BOOST_MATH_NOEXCEPT(T);
BOOST_MATH_GPU_ENABLED typename T::value_type get(const T&) BOOST_MATH_NOEXCEPT(T);
}
template <class Tuple, class T>
void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T)
BOOST_MATH_GPU_ENABLED void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T)
{
using dummy::get;
// Use ADL to find the right overload for get:
@@ -49,7 +47,7 @@ void unpack_tuple(const Tuple& t, T& a, T& b) BOOST_MATH_NOEXCEPT(T)
b = get<1>(t);
}
template <class Tuple, class T>
void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T)
BOOST_MATH_GPU_ENABLED void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T)
{
using dummy::get;
// Use ADL to find the right overload for get:
@@ -59,7 +57,7 @@ void unpack_tuple(const Tuple& t, T& a, T& b, T& c) BOOST_MATH_NOEXCEPT(T)
}
template <class Tuple, class T>
inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T)
BOOST_MATH_GPU_ENABLED inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T)
{
using dummy::get;
// Rely on ADL to find the correct overload of get:
@@ -67,26 +65,30 @@ inline void unpack_0(const Tuple& t, T& val) BOOST_MATH_NOEXCEPT(T)
}
template <class T, class U, class V>
inline void unpack_tuple(const std::pair<T, U>& p, V& a, V& b) BOOST_MATH_NOEXCEPT(T)
BOOST_MATH_GPU_ENABLED inline void unpack_tuple(const boost::math::pair<T, U>& p, V& a, V& b) BOOST_MATH_NOEXCEPT(T)
{
a = p.first;
b = p.second;
}
template <class T, class U, class V>
inline void unpack_0(const std::pair<T, U>& p, V& a) BOOST_MATH_NOEXCEPT(T)
BOOST_MATH_GPU_ENABLED inline void unpack_0(const boost::math::pair<T, U>& p, V& a) BOOST_MATH_NOEXCEPT(T)
{
a = p.first;
}
template <class F, class T>
void handle_zero_derivative(F f,
BOOST_MATH_GPU_ENABLED void handle_zero_derivative(F f,
T& last_f0,
const T& f0,
T& delta,
T& result,
T& guess,
const T& min,
const T& max) noexcept(BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
const T& max) noexcept(BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
if (last_f0 == 0)
{
@@ -132,25 +134,29 @@ void handle_zero_derivative(F f,
} // namespace
template <class F, class T, class Tol, class Policy>
std::pair<T, T> bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter, const Policy& pol) noexcept(policies::is_noexcept_error_policy<Policy>::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
BOOST_MATH_GPU_ENABLED boost::math::pair<T, T> bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter, const Policy& pol) noexcept(policies::is_noexcept_error_policy<Policy>::value && BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
T fmin = f(min);
T fmax = f(max);
if (fmin == 0)
{
max_iter = 2;
return std::make_pair(min, min);
return boost::math::make_pair(min, min);
}
if (fmax == 0)
{
max_iter = 2;
return std::make_pair(max, max);
return boost::math::make_pair(max, max);
}
//
// Error checking:
//
static const char* function = "boost::math::tools::bisect<%1%>";
constexpr auto function = "boost::math::tools::bisect<%1%>";
if (min >= max)
{
return boost::math::detail::pair_from_single(policies::raise_evaluation_error(function,
@@ -200,29 +206,41 @@ std::pair<T, T> bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter, con
std::cout << "Bisection required " << max_iter << " iterations.\n";
#endif
return std::make_pair(min, max);
return boost::math::make_pair(min, max);
}
template <class F, class T, class Tol>
inline std::pair<T, T> bisect(F f, T min, T max, Tol tol, std::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
BOOST_MATH_GPU_ENABLED inline boost::math::pair<T, T> bisect(F f, T min, T max, Tol tol, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value && BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
return bisect(f, min, max, tol, max_iter, policies::policy<>());
}
template <class F, class T, class Tol>
inline std::pair<T, T> bisect(F f, T min, T max, Tol tol) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
BOOST_MATH_GPU_ENABLED inline boost::math::pair<T, T> bisect(F f, T min, T max, Tol tol) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value && BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
std::uintmax_t m = (std::numeric_limits<std::uintmax_t>::max)();
boost::math::uintmax_t m = (boost::math::numeric_limits<boost::math::uintmax_t>::max)();
return bisect(f, min, max, tol, m, policies::policy<>());
}
template <class F, class T>
T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
BOOST_MATH_GPU_ENABLED T newton_raphson_iterate(F f, T guess, T min, T max, int digits, boost::math::uintmax_t& max_iter) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value && BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
BOOST_MATH_STD_USING
static const char* function = "boost::math::tools::newton_raphson_iterate<%1%>";
constexpr auto function = "boost::math::tools::newton_raphson_iterate<%1%>";
if (min > max)
{
return policies::raise_evaluation_error(function, "Range arguments in wrong order in boost::math::tools::newton_raphson_iterate(first arg=%1%)", min, boost::math::policies::policy<>());
@@ -249,7 +267,7 @@ T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t&
T max_range_f = 0;
T min_range_f = 0;
std::uintmax_t count(max_iter);
boost::math::uintmax_t count(max_iter);
#ifdef BOOST_MATH_INSTRUMENT
std::cout << "Newton_raphson_iterate, guess = " << guess << ", min = " << min << ", max = " << max
@@ -336,12 +354,22 @@ T newton_raphson_iterate(F f, T guess, T min, T max, int digits, std::uintmax_t&
}
template <class F, class T>
inline T newton_raphson_iterate(F f, T guess, T min, T max, int digits) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value&& BOOST_MATH_IS_FLOAT(T) && noexcept(std::declval<F>()(std::declval<T>())))
BOOST_MATH_GPU_ENABLED inline T newton_raphson_iterate(F f, T guess, T min, T max, int digits) noexcept(policies::is_noexcept_error_policy<policies::policy<> >::value && BOOST_MATH_IS_FLOAT(T)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<F>()(std::declval<T>()))
#endif
)
{
std::uintmax_t m = (std::numeric_limits<std::uintmax_t>::max)();
boost::math::uintmax_t m = (boost::math::numeric_limits<boost::math::uintmax_t>::max)();
return newton_raphson_iterate(f, guess, min, max, digits, m);
}
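A host-side usage sketch for the GPU-enabled newton_raphson_iterate above, assuming boost::math::pair and boost::math::uintmax_t alias their std counterparts on non-GPU builds; `sqrt2_functor` is an illustrative name:
#include <boost/math/tools/roots.hpp>
#include <cstdio>
struct sqrt2_functor
{
    // Returns (f(x), f'(x)) for f(x) = x^2 - 2
    boost::math::pair<double, double> operator()(double x) const
    {
        return boost::math::make_pair(x * x - 2.0, 2.0 * x);
    }
};
int main()
{
    boost::math::uintmax_t max_iter = 50;
    double r = boost::math::tools::newton_raphson_iterate(sqrt2_functor(), 1.5, 1.0, 2.0, 40, max_iter);
    std::printf("sqrt(2) ~= %.15f in %u iterations\n", r, static_cast<unsigned>(max_iter));
    return 0;
}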
// TODO(mborland): Disabled for now
// Recursion needs to be removed, but there is no demand at this time
#ifdef BOOST_MATH_HAS_NVRTC
}}} // Namespaces
#else
namespace detail {
struct halley_step

View File

@@ -10,10 +10,10 @@
#pragma once
#endif
#include <cmath>
#include <cstdint>
#include <limits>
#include <boost/math/tools/config.hpp>
#include <boost/math/tools/numeric_limits.hpp>
#include <boost/math/tools/cstdint.hpp>
namespace boost{ namespace math{ namespace tools{
@@ -21,13 +21,17 @@ namespace boost{ namespace math{ namespace tools{
// Simple series summation come first:
//
template <class Functor, class U, class V>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms, const V& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms, const V& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
std::uintmax_t counter = max_terms;
boost::math::uintmax_t counter = max_terms;
result_type result = init_value;
result_type next_term;
@@ -44,14 +48,22 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor&
}
template <class Functor, class U>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
typename Functor::result_type init_value = 0;
return sum_series(func, factor, max_terms, init_value);
}
template <class Functor, class U>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, std::uintmax_t& max_terms, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
@@ -60,17 +72,25 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor&
}
template <class Functor>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
std::uintmax_t iters = (std::numeric_limits<std::uintmax_t>::max)();
boost::math::uintmax_t iters = (boost::math::numeric_limits<boost::math::uintmax_t>::max)();
result_type init_val = 0;
return sum_series(func, bits, iters, init_val);
}
template <class Functor>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
@@ -79,23 +99,31 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor&
}
template <class Functor, class U>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type sum_series(Functor& func, int bits, const U& init_value) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
std::uintmax_t iters = (std::numeric_limits<std::uintmax_t>::max)();
boost::math::uintmax_t iters = (boost::math::numeric_limits<boost::math::uintmax_t>::max)();
return sum_series(func, bits, iters, init_value);
}
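A host-side usage sketch for sum_series above: the functor exposes a result_type and returns successive terms from operator(); summation stops once a term drops below the tolerance implied by `bits`. `geometric_terms` is an illustrative name:
#include <boost/math/tools/series.hpp>
#include <cstdio>
struct geometric_terms
{
    typedef double result_type;
    double term = 1.0;
    // Each call yields the next term of 1 + 1/2 + 1/4 + ...
    double operator()() { double t = term; term *= 0.5; return t; }
};
int main()
{
    geometric_terms f;
    boost::math::uintmax_t max_terms = 100;
    double sum = boost::math::tools::sum_series(f, 53, max_terms); // converges to 2
    std::printf("%.15f\n", sum);
    return 0;
}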
//
// Checked summation:
//
template <class Functor, class U, class V>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(Functor& func, const U& factor, std::uintmax_t& max_terms, const V& init_value, V& norm) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(Functor& func, const U& factor, boost::math::uintmax_t& max_terms, const V& init_value, V& norm) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
std::uintmax_t counter = max_terms;
boost::math::uintmax_t counter = max_terms;
result_type result = init_value;
result_type next_term;
@@ -125,7 +153,11 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type checked_sum_series(F
// in any case the result is still much better than a naive summation.
//
template <class Functor>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
@@ -148,13 +180,17 @@ BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Fun
}
template <class Functor>
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits, std::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval<Functor>()()))
BOOST_MATH_GPU_ENABLED inline typename Functor::result_type kahan_sum_series(Functor& func, int bits, boost::math::uintmax_t& max_terms) noexcept(BOOST_MATH_IS_FLOAT(typename Functor::result_type)
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
&& noexcept(std::declval<Functor>()())
#endif
)
{
BOOST_MATH_STD_USING
typedef typename Functor::result_type result_type;
std::uintmax_t counter = max_terms;
boost::math::uintmax_t counter = max_terms;
result_type factor = ldexp(result_type(1), bits);
result_type result = func();

View File

@@ -17,12 +17,14 @@ run test_arcsine_pdf_float.cu ;
run test_arcsine_quan_double.cu ;
run test_arcsine_quan_float.cu ;
run test_arcsine_range_support_double.cu ;
run test_bernoulli_cdf_double.cu ;
run test_bernoulli_cdf_float.cu ;
run test_bernoulli_pdf_double.cu ;
run test_bernoulli_pdf_float.cu ;
run test_bernoulli_range_support_double.cu ;
run test_bernoulli_range_support_float.cu ;
run test_cauchy_cdf_double.cu ;
run test_cauchy_cdf_float.cu ;
run test_cauchy_pdf_double.cu ;
@@ -31,6 +33,7 @@ run test_cauchy_quan_double.cu ;
run test_cauchy_quan_float.cu ;
run test_cauchy_range_support_double.cu ;
run test_cauchy_range_support_float.cu ;
run test_exponential_cdf_double.cu ;
run test_exponential_cdf_float.cu ;
run test_exponential_pdf_double.cu ;
@@ -39,40 +42,47 @@ run test_exponential_quan_double.cu ;
run test_exponential_quan_float.cu ;
run test_exponential_range_support_double.cu ;
run test_exponential_range_support_float.cu ;
run test_extreme_value_cdf_double.cu ;
run test_extreme_value_cdf_float.cu ;
run test_extreme_value_pdf_double.cu ;
run test_extreme_value_pdf_float.cu ;
run test_extreme_value_quan_double.cu ;
run test_extreme_value_quan_float.cu ;
run test_holtsmark_cdf_double.cu ;
run test_holtsmark_cdf_float.cu ;
run test_holtsmark_pdf_double.cu ;
run test_holtsmark_pdf_float.cu ;
run test_landau_cdf_double.cu ;
run test_landau_cdf_float.cu ;
run test_landau_pdf_double.cu ;
run test_landau_pdf_float.cu ;
run test_landau_quan_double.cu ;
run test_landau_quan_float.cu ;
run test_laplace_cdf_double.cu ;
run test_laplace_cdf_float.cu ;
run test_laplace_pdf_double.cu ;
run test_laplace_pdf_float.cu ;
run test_laplace_quan_double.cu ;
run test_laplace_quan_float.cu ;
run test_logistic_cdf_double.cu ;
run test_logistic_cdf_float.cu ;
run test_logistic_pdf_double.cu ;
run test_logistic_pdf_float.cu ;
run test_logistic_quan_double.cu ;
run test_logistic_quan_float.cu ;
run test_mapairy_cdf_double.cu ;
run test_mapairy_cdf_float.cu ;
run test_mapairy_pdf_double.cu ;
run test_mapairy_pdf_float.cu ;
run test_mapairy_quan_double.cu ;
run test_mapairy_quan_float.cu ;
run test_saspoint5_cdf_double.cu ;
run test_saspoint5_cdf_float.cu ;
run test_saspoint5_pdf_double.cu ;
@@ -81,17 +91,52 @@ run test_saspoint5_quan_double.cu ;
run test_saspoint5_quan_float.cu ;
# Special Functions
# run test_beta_simple.cpp ;
run test_beta_double.cu ;
run test_beta_float.cu ;
run test_bessel_i0_double.cu ;
run test_bessel_i0_float.cu ;
run test_bessel_i1_double.cu ;
run test_bessel_i1_float.cu ;
run test_bessel_j0_double.cu ;
run test_bessel_j0_float.cu ;
run test_bessel_j1_double.cu ;
run test_bessel_j1_float.cu ;
run test_bessel_k0_double.cu ;
run test_bessel_k0_float.cu ;
run test_bessel_k1_double.cu ;
run test_bessel_k1_float.cu ;
run test_bessel_kn_double.cu ;
run test_bessel_kn_float.cu ;
run test_bessel_y0_double.cu ;
run test_bessel_y0_float.cu ;
run test_bessel_y1_double.cu ;
run test_bessel_y1_float.cu ;
run test_cyl_bessel_i_double.cu ;
run test_cyl_bessel_i_float.cu ;
run test_cyl_bessel_j_double.cu ;
run test_cyl_bessel_j_float.cu ;
run test_cyl_bessel_k_double.cu ;
run test_cyl_bessel_k_float.cu ;
run test_sph_bessel_double.cu ;
run test_sph_bessel_float.cu ;
run test_cyl_neumann_double.cu ;
run test_cyl_neumann_float.cu ;
run test_sph_neumann_double.cu ;
run test_sph_neumann_float.cu ;
run test_cbrt_double.cu ;
run test_cbrt_float.cu ;
run test_changesign_double.cu ;
run test_changesign_float.cu ;
run test_cos_pi_double.cu ;
run test_cos_pi_float.cu ;
run test_digamma_double.cu ;
run test_digamma_float.cu ;
run test_erf_double.cu ;
run test_erf_float.cu ;
run test_erf_inv_double.cu ;
@@ -100,21 +145,29 @@ run test_erfc_double.cu ;
run test_erfc_float.cu ;
run test_erfc_inv_double.cu ;
run test_erfc_inv_float.cu ;
run test_expm1_double.cu ;
run test_expm1_float.cu ;
run test_lgamma_double.cu ;
run test_lgamma_float.cu ;
run test_log1p_double.cu ;
run test_log1p_float.cu ;
run test_modf_double.cu ;
run test_modf_float.cu ;
run test_round_double.cu ;
run test_round_float.cu ;
run test_sin_pi_double.cu ;
run test_sin_pi_float.cu ;
run test_tgamma_double.cu ;
run test_tgamma_float.cu ;
run test_log1p_double.cu ;
run test_log1p_float.cu ;
run test_modf_double.cu ;
run test_modf_float.cu ;
run test_round_double.cu ;
run test_round_float.cu ;
run test_sin_pi_double.cu ;
run test_sin_pi_float.cu ;
run test_trigamma_double.cu ;
run test_trigamma_float.cu ;
run test_trunc_double.cu ;
run test_trunc_float.cu ;

View File

@@ -90,12 +90,47 @@ run test_saspoint5_quan_nvrtc_float.cpp ;
# Special Functions
run test_beta_nvrtc_double.cpp ;
run test_beta_nvrtc_float.cpp ;
run test_bessel_i0_nvrtc_double.cpp ;
run test_bessel_i0_nvrtc_float.cpp ;
run test_bessel_i1_nvrtc_double.cpp ;
run test_bessel_i1_nvrtc_float.cpp ;
run test_bessel_j0_nvrtc_double.cpp ;
run test_bessel_j0_nvrtc_float.cpp ;
run test_bessel_j1_nvrtc_double.cpp ;
run test_bessel_j1_nvrtc_float.cpp ;
run test_bessel_k0_nvrtc_double.cpp ;
run test_bessel_k0_nvrtc_float.cpp ;
run test_bessel_k1_nvrtc_double.cpp ;
run test_bessel_k1_nvrtc_float.cpp ;
run test_bessel_kn_nvrtc_double.cpp ;
run test_bessel_kn_nvrtc_float.cpp ;
run test_bessel_y0_nvrtc_double.cpp ;
run test_bessel_y0_nvrtc_float.cpp ;
run test_bessel_y1_nvrtc_double.cpp ;
run test_bessel_y1_nvrtc_float.cpp ;
run test_cyl_bessel_i_nvrtc_double.cpp ;
run test_cyl_bessel_i_nvrtc_float.cpp ;
run test_cyl_bessel_j_nvrtc_double.cpp ;
run test_cyl_bessel_j_nvrtc_float.cpp ;
run test_cyl_bessel_k_nvrtc_double.cpp ;
run test_cyl_bessel_k_nvrtc_float.cpp ;
run test_sph_bessel_nvrtc_double.cpp ;
run test_sph_bessel_nvrtc_float.cpp ;
run test_cyl_neumann_nvrtc_double.cpp ;
run test_cyl_neumann_nvrtc_float.cpp ;
run test_sph_neumann_nvrtc_double.cpp ;
run test_sph_neumann_nvrtc_float.cpp ;
run test_cbrt_nvrtc_double.cpp ;
run test_cbrt_nvrtc_float.cpp ;
run test_cos_pi_nvrtc_double.cpp ;
run test_cos_pi_nvrtc_float.cpp ;
run test_digamma_nvrtc_double.cpp ;
run test_digamma_nvrtc_float.cpp ;
run test_erf_nvrtc_double.cpp ;
run test_erf_nvrtc_float.cpp ;
run test_erfc_nvrtc_double.cpp ;
@@ -104,22 +139,32 @@ run test_erf_inv_nvrtc_double.cpp ;
run test_erf_inv_nvrtc_float.cpp ;
run test_erfc_inv_nvrtc_double.cpp ;
run test_erfc_inv_nvrtc_float.cpp ;
run test_expm1_nvrtc_double.cpp ;
run test_expm1_nvrtc_float.cpp ;
run test_fpclassify_nvrtc_double.cpp ;
run test_fpclassify_nvrtc_float.cpp ;
run test_gamma_nvrtc_double.cpp ;
run test_gamma_nvrtc_float.cpp ;
run test_log1p_nvrtc_double.cpp ;
run test_log1p_nvrtc_float.cpp ;
run test_modf_nvrtc_double.cpp ;
run test_modf_nvrtc_float.cpp ;
run test_round_nvrtc_double.cpp ;
run test_round_nvrtc_float.cpp ;
run test_sign_nvrtc_double.cpp ;
run test_sign_nvrtc_float.cpp ;
run test_sin_pi_nvrtc_double.cpp ;
run test_sin_pi_nvrtc_float.cpp ;
run test_trigamma_nvrtc_double.cpp ;
run test_trigamma_nvrtc_float.cpp ;
run test_trunc_nvrtc_double.cpp ;

View File

@@ -25,6 +25,10 @@ run test_saspoint5.cpp ;
# Special Functions
run pow_test.cpp ;
run test_beta_simple.cpp ;
run test_bessel_i.cpp ;
run test_bessel_j.cpp ;
run test_bessel_k.cpp ;
run test_bessel_y.cpp ;
run test_cbrt.cpp ;
run test_sign.cpp ;
run test_round.cpp ;

View File

@@ -3,7 +3,13 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#else
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <boost/math/tools/config.hpp>
#endif
#include "test_bessel_i.hpp"
//
@@ -82,7 +88,11 @@ void expected_results()
"linux", // platform
largest_type, // test type(s)
".*Random.*", // test data group
#ifdef SYCL_LANGUAGE_VERSION
".*", 600, 200);
#else
".*", 400, 200); // test function
#endif
add_expected_result(
"GNU.*", // compiler
@@ -111,7 +121,11 @@ void expected_results()
".*", // platform
largest_type, // test type(s)
".*", // test data group
#ifdef SYCL_LANGUAGE_VERSION
".*", 400, 200);
#else
".*", 20, 10); // test function
#endif
//
// Set error rates a little higher for real_concept -
// now that we use a series approximation for small z

View File

@@ -9,6 +9,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>
#include "functor.hpp"
@@ -180,7 +181,10 @@ void test_bessel(T, const char* name)
//
// Special cases for full coverage:
//
#ifndef BOOST_MATH_NO_EXCEPTIONS
BOOST_CHECK_THROW(boost::math::cyl_bessel_i(T(-2.5), T(-2.5)), std::domain_error);
#endif
BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(0), T(0)), T(1));
BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(10), T(0)), T(0));
BOOST_CHECK_EQUAL(boost::math::cyl_bessel_i(T(-10), T(0)), T(0));
@@ -197,10 +201,12 @@ void test_bessel(T, const char* name)
}
}
T tolerance = boost::math::tools::epsilon<T>() * 100;
#ifndef SYCL_LANGUAGE_VERSION
if ((boost::math::tools::digits<T>() <= std::numeric_limits<double>::digits) && (std::numeric_limits<T>::max_exponent > 1000))
{
BOOST_CHECK_CLOSE_FRACTION(boost::math::cyl_bessel_i(T(0.5), T(710)), SC_(3.3447452278080108123142599104927325061327359278058601201179e306), tolerance);
}
#endif
#if LDBL_MAX_EXP >= 11356
BOOST_IF_CONSTEXPR (std::numeric_limits<T>::max_exponent >= 11356)
{

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 1024;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_i0(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 1024;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_i0(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_i0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_i0.hpp>
extern "C" __global__
void test_bessel_i0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_i0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_i0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_i0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_i0.hpp>
extern "C" __global__
void test_bessel_i0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_i0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_i0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 1024;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_i1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA test kernel
int threadsPerBlock = 1024;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_i1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_i1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_i1.hpp>
extern "C" __global__
void test_bessel_i1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_i1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_i1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at element: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_i1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_i1.hpp>
extern "C" __global__
void test_bessel_i1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_i1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_i1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_i1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
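// The two option sets differ only in the include paths: the CI image layout versus a local checkout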
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_i1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
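// The second input buffer is unused by this kernel; presumably it is kept so all the NVRTC tests share one launch signature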
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_i1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -3,7 +3,12 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#else
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <boost/math/tools/config.hpp>
#endif
#include "test_bessel_j.hpp"

View File

@@ -9,6 +9,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>
@@ -279,7 +280,9 @@ void test_bessel(T, const char* name)
BOOST_MATH_CHECK_THROW(boost::math::sph_bessel(2, T(-2.0)), std::domain_error);
BOOST_CHECK_EQUAL(boost::math::cyl_bessel_j(T(0), T(2.5)), boost::math::cyl_bessel_j(T(0), T(-2.5)));
BOOST_CHECK_EQUAL(boost::math::cyl_bessel_j(T(1), T(2.5)), -boost::math::cyl_bessel_j(T(1), T(-2.5)));
#ifndef SYCL_LANGUAGE_VERSION
BOOST_CHECK_CLOSE_FRACTION(boost::math::cyl_bessel_j(364, T(38.5)), SC_(1.793940496519190500748409872348034004417458734118663909894e-309), tolerance);
#endif
//
// Special cases at infinity:
//

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
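// Time the kernel launch together with the synchronization below, since launches return asynchronously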
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
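// Time the equivalent serial computation on the host for the comparison printed below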
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_j0(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_j0(input_vector[i]));
double t = w.elapsed();
// check the results
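// Device and host run the same implementation, so a tight 10-ulp tolerance suffices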
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath> // std::isfinite
#include <cstdlib> // exit, EXIT_FAILURE
#include <boost/math/special_functions/detail/bessel_j0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_j0.hpp>
extern "C" __global__
void test_bessel_j0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_j0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
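// Fixed seed keeps any failing inputs reproducible across runs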
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_j0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath> // std::isfinite
#include <cstdlib> // exit, EXIT_FAILURE
#include <boost/math/special_functions/detail/bessel_j0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_j0.hpp>
extern "C" __global__
void test_bessel_j0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_j0_kernel");
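// The kernel is declared extern "C", so its symbol is unmangled and cuModuleGetFunction can look it up by name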
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_j0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
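// Synchronize before reading the timer; the launch itself returns immediately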
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_j1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
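// The same detail::bessel_j1 call on the host provides the reference values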
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_j1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath> // std::isfinite
#include <cstdlib> // exit, EXIT_FAILURE
#include <boost/math/special_functions/detail/bessel_j1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_j1.hpp>
extern "C" __global__
void test_bessel_j1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_j1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
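// The driver API takes the addresses of the kernel arguments, in the kernel's parameter order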
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_j1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <cmath> // std::isfinite
#include <cstdlib> // exit, EXIT_FAILURE
#include <boost/math/special_functions/detail/bessel_j1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_j1.hpp>
extern "C" __global__
void test_bessel_j1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_j1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_j1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_j1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
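// NVRTC emits PTX; the driver JIT-compiles it for the attached device when the module is loaded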
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_j1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_j1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -5,7 +5,12 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#else
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <boost/math/tools/config.hpp>
#endif
#ifdef _MSC_VER
# pragma warning(disable : 4756) // overflow in constant arithmetic

View File

@@ -9,6 +9,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>
#include "functor.hpp"
@@ -175,6 +176,7 @@ void test_bessel(T, const char* name)
//
// Extra test coverage:
//
#ifndef SYCL_LANGUAGE_VERSION // SYCL doesn't throw
BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(2), T(-1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(2.2), T(-1)), std::domain_error);
BOOST_IF_CONSTEXPR(std::numeric_limits<T>::has_infinity)
@@ -194,6 +196,7 @@ void test_bessel(T, const char* name)
BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(-1.25), T(0)), std::domain_error);
BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(-1), T(0)), std::domain_error);
BOOST_CHECK_THROW(boost::math::cyl_bessel_k(T(1), T(0)), std::domain_error);
#endif
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
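// Inputs lie in [0, 1], exercising the small-argument region where K0 grows like -log(x)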
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_k0(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k0(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the test CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
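// Reserve up front so no reallocation happens inside the timed loop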
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_k0(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_k0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_k0.hpp>
extern "C" __global__
void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_k0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
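// Arguments up to 1000 push K0 toward underflow; non-finite host results are skipped in the check below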
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_k0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
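// ignore_error avoids throwing on overflow in device code; disabling double promotion prevents internal use of long double, which CUDA lacks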
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_k0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_k0.hpp>
extern "C" __global__
void test_bessel_k0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k0(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_k0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_k0(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at index: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
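// Map this thread onto a single element via the usual global index.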
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_k1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
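// epsilon_difference reports the error in units of machine epsilon; allow up to 10.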
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,100 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k1(in[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
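// rand()/RAND_MAX yields arguments in [0, 1].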
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::detail::bessel_k1(input_vector[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_k1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_k1.hpp>
extern "C" __global__
void test_bessel_k1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_k1_kernel");
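// The include paths below are machine-specific; adjust them to your Boost checkout and CUDA installation.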
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
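// The second input buffer is allocated and copied but not read by this kernel.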
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_k1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_k1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_k1.hpp>
extern "C" __global__
void test_bessel_k1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_k1(in1[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_k1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_k1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_k1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_k1(h_in1[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,105 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
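// bessel_kn requires an explicit policy argument; the default policy is used here.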
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_kn(2, in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
boost::math::policies::policy<> pol;
w.reset();
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_kn(2, input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,105 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_kn(2, in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
boost::math::policies::policy<> pol;
w.reset();
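// Time the equivalent serial computation on the host for comparison.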
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_kn(2, input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,192 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_kn.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_kn.hpp>
extern "C" __global__
void test_bessel_kn_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_kn(2, in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_kn_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_kn_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_kn_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
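// The blocking copy below also synchronizes with the kernel on the default stream.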
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_kn(2, h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,192 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/detail/bessel_kn.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/detail/bessel_kn.hpp>
extern "C" __global__
void test_bessel_kn_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_kn(2, in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_kn_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_kn_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_kn_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
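// Arguments up to 1000: K_n(x) decays exponentially, so large x underflows toward zero.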
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_kn(2, h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -3,7 +3,12 @@
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef SYCL_LANGUAGE_VERSION
#include <pch_light.hpp>
#else
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <boost/math/tools/config.hpp>
#endif
#include "test_bessel_y.hpp"
@@ -234,7 +239,11 @@ void expected_results()
".*", // platform
largest_type, // test type(s)
".*(Y[nv]|y).*Random.*", // test data group
#ifdef SYCL_LANGUAGE_VERSION
".*", 2000, 1000);
#else
".*", 1500, 1000); // test function
#endif
//
// Fallback for sun has to go after the general cases above:
//

View File

@@ -9,6 +9,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/tools/floating_point_comparison.hpp>
#include <boost/math/special_functions/math_fwd.hpp>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/constants/constants.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/array.hpp>
@@ -241,10 +242,12 @@ void test_bessel(T, const char* name)
BOOST_CHECK_EQUAL(boost::math::sph_neumann(2, std::numeric_limits<T>::infinity()), T(0));
}
#ifndef BOOST_MATH_NO_EXCEPTIONS
BOOST_CHECK_THROW(boost::math::cyl_neumann(T(0), T(-1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::cyl_neumann(T(0.2), T(-1)), std::domain_error);
BOOST_CHECK_THROW(boost::math::cyl_neumann(T(2), T(0)), std::domain_error);
BOOST_CHECK_THROW(boost::math::sph_neumann(2, T(-2)), std::domain_error);
#endif
#if LDBL_MAX_EXP > 1024
if (std::numeric_limits<T>::max_exponent > 1024)
{

View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y0(in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
boost::math::policies::policy<> pol;
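// Serial baseline: the same bessel_y0 call, computed on the CPU.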
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_y0(input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y0(in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
boost::math::policies::policy<> pol;
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_y0(input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
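// Host and device run the same double-precision code path; compare within 10 machine epsilons.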
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,194 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y0.hpp>
extern "C" __global__
void test_bessel_y0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y0(in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_y0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_y0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_y0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
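// A fixed seed keeps the inputs reproducible across runs.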
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_y0(h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,194 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y0.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y0.hpp>
extern "C" __global__
void test_bessel_y0_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y0(in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_y0_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_y0_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_y0_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
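// The host reference uses the same default policy as the device kernel.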
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_y0(h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y1(in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
boost::math::policies::policy<> pol;
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_y1(input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,106 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y1(in[i], pol);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector
cuda_managed_ptr<float_type> input_vector(numElements);
// Allocate the managed output vector
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
boost::math::policies::policy<> pol;
for(int i = 0; i < numElements; ++i)
{
results.push_back(boost::math::detail::bessel_y1(input_vector[i], pol));
}
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
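The watch type from stopwatch.hpp is likewise not part of this diff; judging by its use here (elapsed() in seconds, reset()), it is assumed to be a thin std::chrono timer, roughly:

#include <chrono>
class watch_sketch
{
std::chrono::steady_clock::time_point start_ = std::chrono::steady_clock::now();
public:
// Seconds since construction or the last reset()
double elapsed() const { return std::chrono::duration<double>(std::chrono::steady_clock::now() - start_).count(); }
void reset() { start_ = std::chrono::steady_clock::now(); }
};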

View File

@@ -0,0 +1,194 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y1.hpp>
extern "C" __global__
void test_bessel_y1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y1(in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_y1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_y1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_y1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_y1(h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
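A note on the tolerances: boost::math::epsilon_difference(a, b) reports the distance between a and b in multiples of machine epsilon, so the "> 300" check above allows roughly 300 epsilon of relative error between GPU and host results (the CUDA runtime tests use the tighter "> 10"). A hand-rolled approximation, for intuition only and assuming both values are finite and non-zero:

#include <algorithm>
#include <cmath>
#include <limits>
template <typename T>
T approx_epsilon_difference(T a, T b)
{
// Relative difference rescaled into units of machine epsilon
const T rel = std::fabs(a - b) / std::max(std::fabs(a), std::fabs(b));
return rel / std::numeric_limits<T>::epsilon();
}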

View File

@@ -0,0 +1,194 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y1.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/detail/bessel_y1.hpp>
extern "C" __global__
void test_bessel_y1_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
boost::math::policies::policy<> pol;
if (i < numElements)
{
out[i] = boost::math::detail::bessel_y1(in1[i], pol);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_bessel_y1_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_bessel_y1_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_bessel_y1_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
boost::math::policies::policy<> pol;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::detail::bessel_y1(h_in1[i], pol);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
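Because the kernel is declared extern "C", its symbol is not C++-mangled, so the literal string passed to cuModuleGetFunction resolves directly and the nvrtcAddNameExpression call is strictly optional in these tests. For a mangled (e.g. templated) C++ kernel the lowered name would have to be recovered instead; a sketch with a hypothetical kernel name, reusing the prog/module/kernel variables from the test above and omitting error handling:

// Register the name expression *before* nvrtcCompileProgram
nvrtcAddNameExpression(prog, "bessel_kernel<float>");
// ... nvrtcCompileProgram, nvrtcGetPTX, cuModuleLoadDataEx as above ...
const char* lowered = nullptr;
nvrtcGetLoweredName(prog, "bessel_kernel<float>", &lowered); // valid until nvrtcDestroyProgram
cuModuleGetFunction(&kernel, module, lowered);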

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_i(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
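For reference, cyl_bessel_i(v, x) computes the modified Bessel function of the first kind, defined by the everywhere-convergent ascending series

I_v(x) = \sum_{k=0}^{\infty} \frac{(x/2)^{v + 2k}}{k!\, \Gamma(v + k + 1)}

and the random test arguments drawn from [0, 1) keep both order and argument well inside this series' fast-converging regime.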

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_i(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_i_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_i_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_i_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_i_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_i(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}
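The two macros at the top of these NVRTC tests (overflow errors ignored, no internal promotion of double) could equivalently be expressed as an explicit policy passed at each call site, avoiding macro-ordering concerns; a sketch using the standard Boost.Math policy framework, with test_policy a name chosen here for illustration:

#include <boost/math/policies/policy.hpp>
#include <boost/math/special_functions/bessel.hpp>
namespace pol = boost::math::policies;
// Same effect as BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error and
// BOOST_MATH_PROMOTE_DOUBLE_POLICY false, but scoped to the call
using test_policy = pol::policy<pol::overflow_error<pol::ignore_error>, pol::promote_double<false>>;
double v = boost::math::cyl_bessel_i(1.0, 2.0, test_policy());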

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_i_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_i(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_i_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_i_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_i_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_i(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_j(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
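The block sizes in these launchers are hard-coded (1024 in the bessel_y1 tests, 256 everywhere else) and the grid size is the usual ceiling division. If tuning mattered, the runtime could suggest a block size instead; a sketch using the occupancy API, reusing the kernel and buffers from the test above:

int blockSize = 0, minGridSize = 0;
// Ask the runtime for a block size that maximizes occupancy for this kernel
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, cuda_test, 0, 0);
int gridSize = (numElements + blockSize - 1) / blockSize; // ceiling division over all elements
cuda_test<<<gridSize, blockSize>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);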

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_j(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
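cyl_bessel_j(v, x) is the ordinary Bessel function of the first kind; its defining series differs from that of I_v only in the alternating sign of the terms:

J_v(x) = \sum_{k=0}^{\infty} \frac{(-1)^k (x/2)^{v + 2k}}{k!\, \Gamma(v + k + 1)}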

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_j_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_j(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_j_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_j(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_j(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_k(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
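cyl_bessel_k(v, x) is the modified Bessel function of the second kind; for non-integer order it is defined in terms of I_{\pm v}, with integer orders taken as the limit:

K_v(x) = \frac{\pi}{2} \cdot \frac{I_{-v}(x) - I_v(x)}{\sin(v\pi)}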

View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_bessel_k(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
for(int i = 0; i < numElements; ++i)
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_k_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_k_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_k_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_k_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_k(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

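A note on the shape shared by all of the NVRTC tests in this commit: each one compiles the kernel source at runtime (nvrtcCreateProgram / nvrtcCompileProgram), extracts PTX, loads it through the driver API, and launches with cuLaunchKernel. Below is a minimal sketch of the compile-to-PTX step factored into a helper; the name compile_to_ptx and the exception-based error handling are illustrative only, not part of this commit:

#include <nvrtc.h>
#include <stdexcept>
#include <string>
#include <vector>

// Compiles CUDA source to PTX, throwing with the build log on failure.
std::string compile_to_ptx(const char* source, const char* name,
                           const std::vector<const char*>& opts)
{
    nvrtcProgram prog;
    if (nvrtcCreateProgram(&prog, source, name, 0, nullptr, nullptr) != NVRTC_SUCCESS)
        throw std::runtime_error("nvrtcCreateProgram failed");
    if (nvrtcCompileProgram(prog, static_cast<int>(opts.size()), opts.data()) != NVRTC_SUCCESS)
    {
        std::size_t log_size;
        nvrtcGetProgramLogSize(prog, &log_size);
        std::string log(log_size, '\0');
        nvrtcGetProgramLog(prog, &log[0]);
        nvrtcDestroyProgram(&prog);
        throw std::runtime_error("NVRTC compilation failed:\n" + log);
    }
    std::size_t ptx_size;
    nvrtcGetPTXSize(prog, &ptx_size);
    std::string ptx(ptx_size, '\0');
    nvrtcGetPTX(prog, &ptx[0]);
    nvrtcDestroyProgram(&prog);
    return ptx;
}
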
View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_k_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_bessel_k(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_k_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_k_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_k_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_bessel_k(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

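The two macros defined at the top of each NVRTC test are load-bearing: device code cannot throw, so BOOST_MATH_OVERFLOW_ERROR_POLICY is set to ignore_error, and BOOST_MATH_PROMOTE_DOUBLE_POLICY is set to false so that float tests exercise genuine float arithmetic instead of silently promoting to double. A host-side sketch of the observable effect (the argument choice is illustrative):

#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
#include <boost/math/special_functions/bessel.hpp>
#include <iostream>

int main()
{
    // K_2(x) grows like 2/x^2 as x -> 0, overflowing float long before x = 1e-30f.
    // Under ignore_error the overflow yields infinity rather than an exception.
    std::cout << boost::math::cyl_bessel_k(2.0f, 1e-30f) << '\n';
}
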
View File

@@ -0,0 +1,116 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_neumann(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
bool failed = false;
// check the results
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000)
{
std::cout << "error at line: " << i
<< "\nParallel: " << results[i]
<< "\n Serial: " << output_vector[i]
<< "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

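On the verification step used throughout these tests: boost::math::epsilon_difference reports the relative difference between the serial and parallel results in units of machine epsilon, so a "> 5000" check tolerates disagreement of up to roughly 5000 ULPs. A rough standalone equivalent for double precision (within_ulps is a hypothetical helper, not a Boost API):

#include <algorithm>
#include <cmath>
#include <limits>

// Relative error between two values, expressed in multiples of machine epsilon.
bool within_ulps(double serial, double parallel, double max_eps)
{
    const double diff  = std::fabs(serial - parallel);
    const double denom = (std::min)(std::fabs(serial), std::fabs(parallel));
    return diff / denom / std::numeric_limits<double>::epsilon() <= max_eps;
}

int main()
{
    // 1e-13 relative error is ~450 epsilon in double: comfortably inside 5000.
    return within_ulps(1.0, 1.0 + 1e-13, 5000) ? 0 : 1;
}
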
View File

@@ -0,0 +1,104 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::cyl_neumann(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
bool failed = false;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(output_vector[i]) && std::isfinite(results[i])
&& boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
failed = true;
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

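The cuda_managed_ptr<T> and stopwatch helpers included by the plain CUDA tests live in headers that are not part of this diff. A minimal sketch of what such a unified-memory wrapper could look like (managed_array is a hypothetical stand-in; the real helper may differ, e.g. in error handling):

#include <cuda_runtime.h>
#include <cstddef>

template <typename T>
class managed_array
{
    T* p_ {nullptr};
public:
    // cudaMallocManaged memory is reachable from both host and device code.
    explicit managed_array(std::size_t n) { cudaMallocManaged(&p_, n * sizeof(T)); }
    ~managed_array() { cudaFree(p_); }
    managed_array(const managed_array&) = delete;
    managed_array& operator=(const managed_array&) = delete;
    T* get() const { return p_; }
    T& operator[](std::size_t i) const { return p_[i]; }
};
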
View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_neumann_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_neumann_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_neumann_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_neumann(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

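Every launch in these tests sizes its grid with the same integer ceiling division, (numElements + blockSize - 1) / blockSize, and relies on the i < numElements guard inside the kernel to discard the overshoot: 5000 elements in 256-thread blocks gives 20 blocks, i.e. 5120 threads, of which 120 fail the bounds check. As a compile-time sketch:

// Ceiling division as used for grid sizing above.
constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }
static_assert(ceil_div(5000, 256) == 20, "20 blocks of 256 threads cover 5000 elements");

int main() {}
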
View File

@@ -0,0 +1,190 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_neumann_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::cyl_neumann(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_neumann_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_neumann_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_neumann_kernel"), "Failed to get kernel function");
int numElements = 5000;
float_type *h_in1, *h_in2, *h_out;
float_type *d_in1, *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new float_type[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_real_distribution<float_type> dist(0.0f, 1000.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = static_cast<float_type>(dist(rng));
h_in2[i] = static_cast<float_type>(dist(rng));
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::cyl_neumann(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

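A recurring rough edge in the NVRTC tests: the host buffers and the PTX image are managed with raw new/delete, so the early exit() paths (compilation failure, CUDA errors) leak them. A sketch of the same host allocations with automatic cleanup; the rest of the test would stay as written:

#include <memory>

int main()
{
    const int numElements = 5000;
    std::unique_ptr<double[]> h_in1(new double[numElements]);
    std::unique_ptr<double[]> h_in2(new double[numElements]);
    std::unique_ptr<double[]> h_out(new double[numElements]);
    // ... fill inputs, launch, copy back, and verify as above; no delete[]
    // calls are needed and early returns no longer leak the host side.
    return 0;
}
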
View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const unsigned *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_bessel(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<unsigned> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
std::mt19937_64 rng {42};
std::uniform_int_distribution<unsigned> order(1, 100);
std::uniform_real_distribution<float_type> val(0, 100);
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = order(rng);
input_vector2[i] = val(rng);
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::sph_bessel(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
bool failed = false;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 3000)
{
std::cout << "error at line: " << i
<< "\nParallel: " << results[i]
<< "\n Serial: " << output_vector[i]
<< "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

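Unlike the cylindrical functions, sph_bessel takes an unsigned integer order as its first argument, which is why these kernels carry a separate unsigned input array. A quick host-side sanity check; the expected value follows from j0(x) = sin(x)/x:

#include <boost/math/special_functions/bessel.hpp>
#include <iostream>

int main()
{
    // Spherical Bessel j0(1) = sin(1)/1 ~ 0.841471
    std::cout << boost::math::sph_bessel(0u, 1.0) << '\n';
}
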
View File

@@ -0,0 +1,119 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef float float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const unsigned *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_bessel(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<unsigned> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
std::mt19937_64 rng {42};
std::uniform_int_distribution<unsigned> order(1, 100);
std::uniform_real_distribution<float_type> val(0, 100);
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = order(rng);
input_vector2[i] = val(rng);
}
// Launch the CUDA kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::sph_bessel(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
bool failed = false;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(output_vector[i]) && std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 150)
{
std::cout << "error at line: " << i
<< "\nParallel: " << results[i]
<< "\n Serial: " << output_vector[i]
<< "\n Dist: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
failed = true;
}
}
}
if (failed)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}

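The sph_bessel and NVRTC tests seed std::mt19937_64 with a fixed constant so that any reported mismatch is reproducible from run to run, whereas the cyl_neumann CUDA tests above still use the older rand()-based pattern. The seeded idiom in isolation:

#include <iostream>
#include <random>

int main()
{
    std::mt19937_64 rng(42); // fixed seed: identical input sequence every run
    std::uniform_int_distribution<unsigned> order(1, 100);
    std::uniform_real_distribution<double> val(0.0, 100.0);
    std::cout << order(rng) << ' ' << val(rng) << '\n';
}
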
View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef double float_type;
const char* cuda_kernel = R"(
typedef double float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_j_kernel(const unsigned *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_bessel(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
int numElements = 5000;
unsigned *h_in1, *d_in1;
float_type *h_in2, *h_out;
float_type *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new unsigned[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_int_distribution<unsigned> order(1, 100);
std::uniform_real_distribution<float_type> val(0.0f, 100.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = order(rng);
h_in2[i] = val(rng);
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(unsigned)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(unsigned), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_bessel(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 3000)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
failed = true;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (failed)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

View File

@@ -0,0 +1,199 @@
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
// Must be included first
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <exception>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/relative_difference.hpp>
typedef float float_type;
const char* cuda_kernel = R"(
typedef float float_type;
#include <cuda/std/type_traits>
#include <boost/math/special_functions/bessel.hpp>
extern "C" __global__
void test_cyl_bessel_j_kernel(const unsigned *in1, const float_type *in2, float_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::sph_bessel(in1[i], in2[i]);
}
}
)";
void checkCUDAError(cudaError_t result, const char* msg)
{
if (result != cudaSuccess)
{
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
void checkCUError(CUresult result, const char* msg)
{
if (result != CUDA_SUCCESS)
{
const char* errorStr;
cuGetErrorString(result, &errorStr);
std::cerr << msg << ": " << errorStr << std::endl;
exit(EXIT_FAILURE);
}
}
void checkNVRTCError(nvrtcResult result, const char* msg)
{
if (result != NVRTC_SUCCESS)
{
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
exit(EXIT_FAILURE);
}
}
int main()
{
try
{
// Initialize CUDA driver API
checkCUError(cuInit(0), "Failed to initialize CUDA");
// Create CUDA context
CUcontext context;
CUdevice device;
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
nvrtcProgram prog;
nvrtcResult res;
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_cyl_bessel_j_kernel.cu", 0, nullptr, nullptr);
checkNVRTCError(res, "Failed to create NVRTC program");
nvrtcAddNameExpression(prog, "test_cyl_bessel_j_kernel");
#ifdef BOOST_MATH_NVRTC_CI_RUN
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#else
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
#endif
// Compile the program
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
if (res != NVRTC_SUCCESS)
{
size_t log_size;
nvrtcGetProgramLogSize(prog, &log_size);
char* log = new char[log_size];
nvrtcGetProgramLog(prog, log);
std::cerr << "Compilation failed:\n" << log << std::endl;
delete[] log;
exit(EXIT_FAILURE);
}
// Get PTX from the program
size_t ptx_size;
nvrtcGetPTXSize(prog, &ptx_size);
char* ptx = new char[ptx_size];
nvrtcGetPTX(prog, ptx);
// Load PTX into CUDA module
CUmodule module;
CUfunction kernel;
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
checkCUError(cuModuleGetFunction(&kernel, module, "test_cyl_bessel_j_kernel"), "Failed to get kernel function");
int numElements = 5000;
unsigned *h_in1, *d_in1;
float_type *h_in2, *h_out;
float_type *d_in2, *d_out;
// Allocate memory on the host
h_in1 = new unsigned[numElements];
h_in2 = new float_type[numElements];
h_out = new float_type[numElements];
// Initialize input arrays
std::mt19937_64 rng(42);
std::uniform_int_distribution<unsigned> order(1, 100);
std::uniform_real_distribution<float_type> val(0.0f, 100.0f);
for (int i = 0; i < numElements; ++i)
{
h_in1[i] = order(rng);
h_in2[i] = val(rng);
}
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(unsigned)), "Failed to allocate device memory for d_in1");
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(unsigned), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
int blockSize = 256;
int numBlocks = (numElements + blockSize - 1) / blockSize;
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
// Verify Result
bool failed = false;
for (int i = 0; i < numElements; ++i)
{
const auto res = boost::math::sph_bessel(h_in1[i], h_in2[i]);
if (std::isfinite(res))
{
if (boost::math::epsilon_difference(res, h_out[i]) > 3000)
{
std::cout << "error at line: " << i
<< "\nParallel: " << h_out[i]
<< "\n Serial: " << res
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
failed = true;
}
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
delete[] h_in1;
delete[] h_in2;
delete[] h_out;
nvrtcDestroyProgram(&prog);
delete[] ptx;
cuCtxDestroy(context);
if (failed)
{
return 1;
}
std::cout << "Kernel executed successfully." << std::endl;
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Stopped with exception: " << e.what() << std::endl;
return EXIT_FAILURE;
}
}

Some files were not shown because too many files have changed in this diff.